Commit 794b1248be4e7e157f5535c3ee49168aa4643349

Authored by Vladimir Davydov
Committed by Linus Torvalds
1 parent 5722d094ad

memcg, slab: separate memcg vs root cache creation paths

Memcg-awareness turned kmem_cache_create() into a dirty interweaving of
memcg-only and except-for-memcg calls.  To clean this up, let's move the
code responsible for memcg cache creation to a separate function.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Glauber Costa <glommer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 111 additions and 95 deletions Inline Diff

include/linux/memcontrol.h
1 /* memcontrol.h - Memory Controller 1 /* memcontrol.h - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version. 12 * (at your option) any later version.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details. 17 * GNU General Public License for more details.
18 */ 18 */
19 19
20 #ifndef _LINUX_MEMCONTROL_H 20 #ifndef _LINUX_MEMCONTROL_H
21 #define _LINUX_MEMCONTROL_H 21 #define _LINUX_MEMCONTROL_H
22 #include <linux/cgroup.h> 22 #include <linux/cgroup.h>
23 #include <linux/vm_event_item.h> 23 #include <linux/vm_event_item.h>
24 #include <linux/hardirq.h> 24 #include <linux/hardirq.h>
25 #include <linux/jump_label.h> 25 #include <linux/jump_label.h>
26 26
27 struct mem_cgroup; 27 struct mem_cgroup;
28 struct page_cgroup; 28 struct page_cgroup;
29 struct page; 29 struct page;
30 struct mm_struct; 30 struct mm_struct;
31 struct kmem_cache; 31 struct kmem_cache;
32 32
33 /* 33 /*
34 * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, 34 * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
35 * These two lists should keep in accord with each other. 35 * These two lists should keep in accord with each other.
36 */ 36 */
37 enum mem_cgroup_stat_index { 37 enum mem_cgroup_stat_index {
38 /* 38 /*
39 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 39 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
40 */ 40 */
41 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 41 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
42 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 42 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
43 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ 43 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
44 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 44 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
45 MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ 45 MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
46 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 46 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
47 MEM_CGROUP_STAT_NSTATS, 47 MEM_CGROUP_STAT_NSTATS,
48 }; 48 };
49 49
50 struct mem_cgroup_reclaim_cookie { 50 struct mem_cgroup_reclaim_cookie {
51 struct zone *zone; 51 struct zone *zone;
52 int priority; 52 int priority;
53 unsigned int generation; 53 unsigned int generation;
54 }; 54 };
55 55
56 #ifdef CONFIG_MEMCG 56 #ifdef CONFIG_MEMCG
57 /* 57 /*
58 * All "charge" functions with gfp_mask should use GFP_KERNEL or 58 * All "charge" functions with gfp_mask should use GFP_KERNEL or
59 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't 59 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
60 * alloc memory but reclaims memory from all available zones. So, "where I want 60 * alloc memory but reclaims memory from all available zones. So, "where I want
61 * memory from" bits of gfp_mask has no meaning. So any bits of that field is 61 * memory from" bits of gfp_mask has no meaning. So any bits of that field is
62 * available but adding a rule is better. charge functions' gfp_mask should 62 * available but adding a rule is better. charge functions' gfp_mask should
63 * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous 63 * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
64 * codes. 64 * codes.
65 * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 65 * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
66 */ 66 */
67 67
68 extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, 68 extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm,
69 gfp_t gfp_mask); 69 gfp_t gfp_mask);
70 /* for swap handling */ 70 /* for swap handling */
71 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 71 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
72 struct page *page, gfp_t mask, struct mem_cgroup **memcgp); 72 struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
73 extern void mem_cgroup_commit_charge_swapin(struct page *page, 73 extern void mem_cgroup_commit_charge_swapin(struct page *page,
74 struct mem_cgroup *memcg); 74 struct mem_cgroup *memcg);
75 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); 75 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
76 76
77 extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, 77 extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
78 gfp_t gfp_mask); 78 gfp_t gfp_mask);
79 79
80 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 80 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
81 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 81 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
82 82
83 /* For coalescing uncharge for reducing memcg' overhead*/ 83 /* For coalescing uncharge for reducing memcg' overhead*/
84 extern void mem_cgroup_uncharge_start(void); 84 extern void mem_cgroup_uncharge_start(void);
85 extern void mem_cgroup_uncharge_end(void); 85 extern void mem_cgroup_uncharge_end(void);
86 86
87 extern void mem_cgroup_uncharge_page(struct page *page); 87 extern void mem_cgroup_uncharge_page(struct page *page);
88 extern void mem_cgroup_uncharge_cache_page(struct page *page); 88 extern void mem_cgroup_uncharge_cache_page(struct page *page);
89 89
90 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 90 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
91 struct mem_cgroup *memcg); 91 struct mem_cgroup *memcg);
92 bool task_in_mem_cgroup(struct task_struct *task, 92 bool task_in_mem_cgroup(struct task_struct *task,
93 const struct mem_cgroup *memcg); 93 const struct mem_cgroup *memcg);
94 94
95 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 95 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
96 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 96 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
97 97
98 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 98 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
99 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); 99 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
100 100
101 static inline 101 static inline
102 bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) 102 bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
103 { 103 {
104 struct mem_cgroup *task_memcg; 104 struct mem_cgroup *task_memcg;
105 bool match; 105 bool match;
106 106
107 rcu_read_lock(); 107 rcu_read_lock();
108 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 108 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
109 match = __mem_cgroup_same_or_subtree(memcg, task_memcg); 109 match = __mem_cgroup_same_or_subtree(memcg, task_memcg);
110 rcu_read_unlock(); 110 rcu_read_unlock();
111 return match; 111 return match;
112 } 112 }
113 113
114 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); 114 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
115 115
116 extern void 116 extern void
117 mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 117 mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
118 struct mem_cgroup **memcgp); 118 struct mem_cgroup **memcgp);
119 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 119 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
120 struct page *oldpage, struct page *newpage, bool migration_ok); 120 struct page *oldpage, struct page *newpage, bool migration_ok);
121 121
122 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 122 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
123 struct mem_cgroup *, 123 struct mem_cgroup *,
124 struct mem_cgroup_reclaim_cookie *); 124 struct mem_cgroup_reclaim_cookie *);
125 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 125 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
126 126
127 /* 127 /*
128 * For memory reclaim. 128 * For memory reclaim.
129 */ 129 */
130 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); 130 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
131 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); 131 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
132 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); 132 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
133 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); 133 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
134 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 134 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
135 struct task_struct *p); 135 struct task_struct *p);
136 extern void mem_cgroup_replace_page_cache(struct page *oldpage, 136 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
137 struct page *newpage); 137 struct page *newpage);
138 138
139 static inline void mem_cgroup_oom_enable(void) 139 static inline void mem_cgroup_oom_enable(void)
140 { 140 {
141 WARN_ON(current->memcg_oom.may_oom); 141 WARN_ON(current->memcg_oom.may_oom);
142 current->memcg_oom.may_oom = 1; 142 current->memcg_oom.may_oom = 1;
143 } 143 }
144 144
145 static inline void mem_cgroup_oom_disable(void) 145 static inline void mem_cgroup_oom_disable(void)
146 { 146 {
147 WARN_ON(!current->memcg_oom.may_oom); 147 WARN_ON(!current->memcg_oom.may_oom);
148 current->memcg_oom.may_oom = 0; 148 current->memcg_oom.may_oom = 0;
149 } 149 }
150 150
151 static inline bool task_in_memcg_oom(struct task_struct *p) 151 static inline bool task_in_memcg_oom(struct task_struct *p)
152 { 152 {
153 return p->memcg_oom.memcg; 153 return p->memcg_oom.memcg;
154 } 154 }
155 155
156 bool mem_cgroup_oom_synchronize(bool wait); 156 bool mem_cgroup_oom_synchronize(bool wait);
157 157
158 #ifdef CONFIG_MEMCG_SWAP 158 #ifdef CONFIG_MEMCG_SWAP
159 extern int do_swap_account; 159 extern int do_swap_account;
160 #endif 160 #endif
161 161
162 static inline bool mem_cgroup_disabled(void) 162 static inline bool mem_cgroup_disabled(void)
163 { 163 {
164 if (memory_cgrp_subsys.disabled) 164 if (memory_cgrp_subsys.disabled)
165 return true; 165 return true;
166 return false; 166 return false;
167 } 167 }
168 168
169 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, 169 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
170 unsigned long *flags); 170 unsigned long *flags);
171 171
172 extern atomic_t memcg_moving; 172 extern atomic_t memcg_moving;
173 173
174 static inline void mem_cgroup_begin_update_page_stat(struct page *page, 174 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
175 bool *locked, unsigned long *flags) 175 bool *locked, unsigned long *flags)
176 { 176 {
177 if (mem_cgroup_disabled()) 177 if (mem_cgroup_disabled())
178 return; 178 return;
179 rcu_read_lock(); 179 rcu_read_lock();
180 *locked = false; 180 *locked = false;
181 if (atomic_read(&memcg_moving)) 181 if (atomic_read(&memcg_moving))
182 __mem_cgroup_begin_update_page_stat(page, locked, flags); 182 __mem_cgroup_begin_update_page_stat(page, locked, flags);
183 } 183 }
184 184
185 void __mem_cgroup_end_update_page_stat(struct page *page, 185 void __mem_cgroup_end_update_page_stat(struct page *page,
186 unsigned long *flags); 186 unsigned long *flags);
187 static inline void mem_cgroup_end_update_page_stat(struct page *page, 187 static inline void mem_cgroup_end_update_page_stat(struct page *page,
188 bool *locked, unsigned long *flags) 188 bool *locked, unsigned long *flags)
189 { 189 {
190 if (mem_cgroup_disabled()) 190 if (mem_cgroup_disabled())
191 return; 191 return;
192 if (*locked) 192 if (*locked)
193 __mem_cgroup_end_update_page_stat(page, flags); 193 __mem_cgroup_end_update_page_stat(page, flags);
194 rcu_read_unlock(); 194 rcu_read_unlock();
195 } 195 }
196 196
197 void mem_cgroup_update_page_stat(struct page *page, 197 void mem_cgroup_update_page_stat(struct page *page,
198 enum mem_cgroup_stat_index idx, 198 enum mem_cgroup_stat_index idx,
199 int val); 199 int val);
200 200
201 static inline void mem_cgroup_inc_page_stat(struct page *page, 201 static inline void mem_cgroup_inc_page_stat(struct page *page,
202 enum mem_cgroup_stat_index idx) 202 enum mem_cgroup_stat_index idx)
203 { 203 {
204 mem_cgroup_update_page_stat(page, idx, 1); 204 mem_cgroup_update_page_stat(page, idx, 1);
205 } 205 }
206 206
207 static inline void mem_cgroup_dec_page_stat(struct page *page, 207 static inline void mem_cgroup_dec_page_stat(struct page *page,
208 enum mem_cgroup_stat_index idx) 208 enum mem_cgroup_stat_index idx)
209 { 209 {
210 mem_cgroup_update_page_stat(page, idx, -1); 210 mem_cgroup_update_page_stat(page, idx, -1);
211 } 211 }
212 212
213 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 213 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
214 gfp_t gfp_mask, 214 gfp_t gfp_mask,
215 unsigned long *total_scanned); 215 unsigned long *total_scanned);
216 216
217 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 217 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
218 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, 218 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
219 enum vm_event_item idx) 219 enum vm_event_item idx)
220 { 220 {
221 if (mem_cgroup_disabled()) 221 if (mem_cgroup_disabled())
222 return; 222 return;
223 __mem_cgroup_count_vm_event(mm, idx); 223 __mem_cgroup_count_vm_event(mm, idx);
224 } 224 }
225 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 225 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
226 void mem_cgroup_split_huge_fixup(struct page *head); 226 void mem_cgroup_split_huge_fixup(struct page *head);
227 #endif 227 #endif
228 228
229 #ifdef CONFIG_DEBUG_VM 229 #ifdef CONFIG_DEBUG_VM
230 bool mem_cgroup_bad_page_check(struct page *page); 230 bool mem_cgroup_bad_page_check(struct page *page);
231 void mem_cgroup_print_bad_page(struct page *page); 231 void mem_cgroup_print_bad_page(struct page *page);
232 #endif 232 #endif
233 #else /* CONFIG_MEMCG */ 233 #else /* CONFIG_MEMCG */
234 struct mem_cgroup; 234 struct mem_cgroup;
235 235
236 static inline int mem_cgroup_charge_anon(struct page *page, 236 static inline int mem_cgroup_charge_anon(struct page *page,
237 struct mm_struct *mm, gfp_t gfp_mask) 237 struct mm_struct *mm, gfp_t gfp_mask)
238 { 238 {
239 return 0; 239 return 0;
240 } 240 }
241 241
242 static inline int mem_cgroup_charge_file(struct page *page, 242 static inline int mem_cgroup_charge_file(struct page *page,
243 struct mm_struct *mm, gfp_t gfp_mask) 243 struct mm_struct *mm, gfp_t gfp_mask)
244 { 244 {
245 return 0; 245 return 0;
246 } 246 }
247 247
248 static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 248 static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
249 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) 249 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
250 { 250 {
251 return 0; 251 return 0;
252 } 252 }
253 253
254 static inline void mem_cgroup_commit_charge_swapin(struct page *page, 254 static inline void mem_cgroup_commit_charge_swapin(struct page *page,
255 struct mem_cgroup *memcg) 255 struct mem_cgroup *memcg)
256 { 256 {
257 } 257 }
258 258
259 static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 259 static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
260 { 260 {
261 } 261 }
262 262
263 static inline void mem_cgroup_uncharge_start(void) 263 static inline void mem_cgroup_uncharge_start(void)
264 { 264 {
265 } 265 }
266 266
267 static inline void mem_cgroup_uncharge_end(void) 267 static inline void mem_cgroup_uncharge_end(void)
268 { 268 {
269 } 269 }
270 270
271 static inline void mem_cgroup_uncharge_page(struct page *page) 271 static inline void mem_cgroup_uncharge_page(struct page *page)
272 { 272 {
273 } 273 }
274 274
275 static inline void mem_cgroup_uncharge_cache_page(struct page *page) 275 static inline void mem_cgroup_uncharge_cache_page(struct page *page)
276 { 276 {
277 } 277 }
278 278
279 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 279 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
280 struct mem_cgroup *memcg) 280 struct mem_cgroup *memcg)
281 { 281 {
282 return &zone->lruvec; 282 return &zone->lruvec;
283 } 283 }
284 284
285 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, 285 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
286 struct zone *zone) 286 struct zone *zone)
287 { 287 {
288 return &zone->lruvec; 288 return &zone->lruvec;
289 } 289 }
290 290
291 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 291 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
292 { 292 {
293 return NULL; 293 return NULL;
294 } 294 }
295 295
296 static inline bool mm_match_cgroup(struct mm_struct *mm, 296 static inline bool mm_match_cgroup(struct mm_struct *mm,
297 struct mem_cgroup *memcg) 297 struct mem_cgroup *memcg)
298 { 298 {
299 return true; 299 return true;
300 } 300 }
301 301
302 static inline bool task_in_mem_cgroup(struct task_struct *task, 302 static inline bool task_in_mem_cgroup(struct task_struct *task,
303 const struct mem_cgroup *memcg) 303 const struct mem_cgroup *memcg)
304 { 304 {
305 return true; 305 return true;
306 } 306 }
307 307
308 static inline struct cgroup_subsys_state 308 static inline struct cgroup_subsys_state
309 *mem_cgroup_css(struct mem_cgroup *memcg) 309 *mem_cgroup_css(struct mem_cgroup *memcg)
310 { 310 {
311 return NULL; 311 return NULL;
312 } 312 }
313 313
314 static inline void 314 static inline void
315 mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 315 mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
316 struct mem_cgroup **memcgp) 316 struct mem_cgroup **memcgp)
317 { 317 {
318 } 318 }
319 319
320 static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, 320 static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
321 struct page *oldpage, struct page *newpage, bool migration_ok) 321 struct page *oldpage, struct page *newpage, bool migration_ok)
322 { 322 {
323 } 323 }
324 324
325 static inline struct mem_cgroup * 325 static inline struct mem_cgroup *
326 mem_cgroup_iter(struct mem_cgroup *root, 326 mem_cgroup_iter(struct mem_cgroup *root,
327 struct mem_cgroup *prev, 327 struct mem_cgroup *prev,
328 struct mem_cgroup_reclaim_cookie *reclaim) 328 struct mem_cgroup_reclaim_cookie *reclaim)
329 { 329 {
330 return NULL; 330 return NULL;
331 } 331 }
332 332
333 static inline void mem_cgroup_iter_break(struct mem_cgroup *root, 333 static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
334 struct mem_cgroup *prev) 334 struct mem_cgroup *prev)
335 { 335 {
336 } 336 }
337 337
338 static inline bool mem_cgroup_disabled(void) 338 static inline bool mem_cgroup_disabled(void)
339 { 339 {
340 return true; 340 return true;
341 } 341 }
342 342
343 static inline int 343 static inline int
344 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 344 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
345 { 345 {
346 return 1; 346 return 1;
347 } 347 }
348 348
349 static inline unsigned long 349 static inline unsigned long
350 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 350 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
351 { 351 {
352 return 0; 352 return 0;
353 } 353 }
354 354
355 static inline void 355 static inline void
356 mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 356 mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
357 int increment) 357 int increment)
358 { 358 {
359 } 359 }
360 360
361 static inline void 361 static inline void
362 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 362 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
363 { 363 {
364 } 364 }
365 365
366 static inline void mem_cgroup_begin_update_page_stat(struct page *page, 366 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
367 bool *locked, unsigned long *flags) 367 bool *locked, unsigned long *flags)
368 { 368 {
369 } 369 }
370 370
371 static inline void mem_cgroup_end_update_page_stat(struct page *page, 371 static inline void mem_cgroup_end_update_page_stat(struct page *page,
372 bool *locked, unsigned long *flags) 372 bool *locked, unsigned long *flags)
373 { 373 {
374 } 374 }
375 375
376 static inline void mem_cgroup_oom_enable(void) 376 static inline void mem_cgroup_oom_enable(void)
377 { 377 {
378 } 378 }
379 379
380 static inline void mem_cgroup_oom_disable(void) 380 static inline void mem_cgroup_oom_disable(void)
381 { 381 {
382 } 382 }
383 383
384 static inline bool task_in_memcg_oom(struct task_struct *p) 384 static inline bool task_in_memcg_oom(struct task_struct *p)
385 { 385 {
386 return false; 386 return false;
387 } 387 }
388 388
389 static inline bool mem_cgroup_oom_synchronize(bool wait) 389 static inline bool mem_cgroup_oom_synchronize(bool wait)
390 { 390 {
391 return false; 391 return false;
392 } 392 }
393 393
394 static inline void mem_cgroup_inc_page_stat(struct page *page, 394 static inline void mem_cgroup_inc_page_stat(struct page *page,
395 enum mem_cgroup_stat_index idx) 395 enum mem_cgroup_stat_index idx)
396 { 396 {
397 } 397 }
398 398
399 static inline void mem_cgroup_dec_page_stat(struct page *page, 399 static inline void mem_cgroup_dec_page_stat(struct page *page,
400 enum mem_cgroup_stat_index idx) 400 enum mem_cgroup_stat_index idx)
401 { 401 {
402 } 402 }
403 403
404 static inline 404 static inline
405 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 405 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
406 gfp_t gfp_mask, 406 gfp_t gfp_mask,
407 unsigned long *total_scanned) 407 unsigned long *total_scanned)
408 { 408 {
409 return 0; 409 return 0;
410 } 410 }
411 411
412 static inline void mem_cgroup_split_huge_fixup(struct page *head) 412 static inline void mem_cgroup_split_huge_fixup(struct page *head)
413 { 413 {
414 } 414 }
415 415
416 static inline 416 static inline
417 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 417 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
418 { 418 {
419 } 419 }
420 static inline void mem_cgroup_replace_page_cache(struct page *oldpage, 420 static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
421 struct page *newpage) 421 struct page *newpage)
422 { 422 {
423 } 423 }
424 #endif /* CONFIG_MEMCG */ 424 #endif /* CONFIG_MEMCG */
425 425
426 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) 426 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
427 static inline bool 427 static inline bool
428 mem_cgroup_bad_page_check(struct page *page) 428 mem_cgroup_bad_page_check(struct page *page)
429 { 429 {
430 return false; 430 return false;
431 } 431 }
432 432
433 static inline void 433 static inline void
434 mem_cgroup_print_bad_page(struct page *page) 434 mem_cgroup_print_bad_page(struct page *page)
435 { 435 {
436 } 436 }
437 #endif 437 #endif
438 438
439 enum { 439 enum {
440 UNDER_LIMIT, 440 UNDER_LIMIT,
441 SOFT_LIMIT, 441 SOFT_LIMIT,
442 OVER_LIMIT, 442 OVER_LIMIT,
443 }; 443 };
444 444
445 struct sock; 445 struct sock;
446 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 446 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
447 void sock_update_memcg(struct sock *sk); 447 void sock_update_memcg(struct sock *sk);
448 void sock_release_memcg(struct sock *sk); 448 void sock_release_memcg(struct sock *sk);
449 #else 449 #else
450 static inline void sock_update_memcg(struct sock *sk) 450 static inline void sock_update_memcg(struct sock *sk)
451 { 451 {
452 } 452 }
453 static inline void sock_release_memcg(struct sock *sk) 453 static inline void sock_release_memcg(struct sock *sk)
454 { 454 {
455 } 455 }
456 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ 456 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
457 457
458 #ifdef CONFIG_MEMCG_KMEM 458 #ifdef CONFIG_MEMCG_KMEM
459 extern struct static_key memcg_kmem_enabled_key; 459 extern struct static_key memcg_kmem_enabled_key;
460 460
461 extern int memcg_limited_groups_array_size; 461 extern int memcg_limited_groups_array_size;
462 462
463 /* 463 /*
464 * Helper macro to loop through all memcg-specific caches. Callers must still 464 * Helper macro to loop through all memcg-specific caches. Callers must still
465 * check if the cache is valid (it is either valid or NULL). 465 * check if the cache is valid (it is either valid or NULL).
466 * the slab_mutex must be held when looping through those caches 466 * the slab_mutex must be held when looping through those caches
467 */ 467 */
468 #define for_each_memcg_cache_index(_idx) \ 468 #define for_each_memcg_cache_index(_idx) \
469 for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) 469 for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
470 470
471 static inline bool memcg_kmem_enabled(void) 471 static inline bool memcg_kmem_enabled(void)
472 { 472 {
473 return static_key_false(&memcg_kmem_enabled_key); 473 return static_key_false(&memcg_kmem_enabled_key);
474 } 474 }
475 475
476 /* 476 /*
477 * In general, we'll do everything in our power to not incur in any overhead 477 * In general, we'll do everything in our power to not incur in any overhead
478 * for non-memcg users for the kmem functions. Not even a function call, if we 478 * for non-memcg users for the kmem functions. Not even a function call, if we
479 * can avoid it. 479 * can avoid it.
480 * 480 *
481 * Therefore, we'll inline all those functions so that in the best case, we'll 481 * Therefore, we'll inline all those functions so that in the best case, we'll
482 * see that kmemcg is off for everybody and proceed quickly. If it is on, 482 * see that kmemcg is off for everybody and proceed quickly. If it is on,
483 * we'll still do most of the flag checking inline. We check a lot of 483 * we'll still do most of the flag checking inline. We check a lot of
484 * conditions, but because they are pretty simple, they are expected to be 484 * conditions, but because they are pretty simple, they are expected to be
485 * fast. 485 * fast.
486 */ 486 */
487 bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, 487 bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
488 int order); 488 int order);
489 void __memcg_kmem_commit_charge(struct page *page, 489 void __memcg_kmem_commit_charge(struct page *page,
490 struct mem_cgroup *memcg, int order); 490 struct mem_cgroup *memcg, int order);
491 void __memcg_kmem_uncharge_pages(struct page *page, int order); 491 void __memcg_kmem_uncharge_pages(struct page *page, int order);
492 492
493 int memcg_cache_id(struct mem_cgroup *memcg); 493 int memcg_cache_id(struct mem_cgroup *memcg);
494 494
495 char *memcg_create_cache_name(struct mem_cgroup *memcg, 495 char *memcg_create_cache_name(struct mem_cgroup *memcg,
496 struct kmem_cache *root_cache); 496 struct kmem_cache *root_cache);
497 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 497 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
498 struct kmem_cache *root_cache); 498 struct kmem_cache *root_cache);
499 void memcg_free_cache_params(struct kmem_cache *s); 499 void memcg_free_cache_params(struct kmem_cache *s);
500 void memcg_register_cache(struct kmem_cache *s); 500 void memcg_register_cache(struct kmem_cache *s);
501 void memcg_unregister_cache(struct kmem_cache *s); 501 void memcg_unregister_cache(struct kmem_cache *s);
502 502
503 int memcg_update_cache_size(struct kmem_cache *s, int num_groups); 503 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
504 void memcg_update_array_size(int num_groups); 504 void memcg_update_array_size(int num_groups);
505 505
506 struct kmem_cache * 506 struct kmem_cache *
507 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); 507 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
508 508
509 void mem_cgroup_destroy_cache(struct kmem_cache *cachep); 509 void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
510 void kmem_cache_destroy_memcg_children(struct kmem_cache *s); 510 void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
511 511
512 /** 512 /**
513 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. 513 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
514 * @gfp: the gfp allocation flags. 514 * @gfp: the gfp allocation flags.
515 * @memcg: a pointer to the memcg this was charged against. 515 * @memcg: a pointer to the memcg this was charged against.
516 * @order: allocation order. 516 * @order: allocation order.
517 * 517 *
518 * returns true if the memcg where the current task belongs can hold this 518 * returns true if the memcg where the current task belongs can hold this
519 * allocation. 519 * allocation.
520 * 520 *
521 * We return true automatically if this allocation is not to be accounted to 521 * We return true automatically if this allocation is not to be accounted to
522 * any memcg. 522 * any memcg.
523 */ 523 */
524 static inline bool 524 static inline bool
525 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) 525 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
526 { 526 {
527 if (!memcg_kmem_enabled()) 527 if (!memcg_kmem_enabled())
528 return true; 528 return true;
529 529
530 /* 530 /*
531 * __GFP_NOFAIL allocations will move on even if charging is not 531 * __GFP_NOFAIL allocations will move on even if charging is not
532 * possible. Therefore we don't even try, and have this allocation 532 * possible. Therefore we don't even try, and have this allocation
533 * unaccounted. We could in theory charge it with 533 * unaccounted. We could in theory charge it with
534 * res_counter_charge_nofail, but we hope those allocations are rare, 534 * res_counter_charge_nofail, but we hope those allocations are rare,
535 * and won't be worth the trouble. 535 * and won't be worth the trouble.
536 */ 536 */
537 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) 537 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
538 return true; 538 return true;
539 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 539 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
540 return true; 540 return true;
541 541
542 /* If the test is dying, just let it go. */ 542 /* If the test is dying, just let it go. */
543 if (unlikely(fatal_signal_pending(current))) 543 if (unlikely(fatal_signal_pending(current)))
544 return true; 544 return true;
545 545
546 return __memcg_kmem_newpage_charge(gfp, memcg, order); 546 return __memcg_kmem_newpage_charge(gfp, memcg, order);
547 } 547 }
548 548
549 /** 549 /**
550 * memcg_kmem_uncharge_pages: uncharge pages from memcg 550 * memcg_kmem_uncharge_pages: uncharge pages from memcg
551 * @page: pointer to struct page being freed 551 * @page: pointer to struct page being freed
552 * @order: allocation order. 552 * @order: allocation order.
553 * 553 *
554 * there is no need to specify memcg here, since it is embedded in page_cgroup 554 * there is no need to specify memcg here, since it is embedded in page_cgroup
555 */ 555 */
556 static inline void 556 static inline void
557 memcg_kmem_uncharge_pages(struct page *page, int order) 557 memcg_kmem_uncharge_pages(struct page *page, int order)
558 { 558 {
559 if (memcg_kmem_enabled()) 559 if (memcg_kmem_enabled())
560 __memcg_kmem_uncharge_pages(page, order); 560 __memcg_kmem_uncharge_pages(page, order);
561 } 561 }
562 562
563 /** 563 /**
564 * memcg_kmem_commit_charge: embeds correct memcg in a page 564 * memcg_kmem_commit_charge: embeds correct memcg in a page
565 * @page: pointer to struct page recently allocated 565 * @page: pointer to struct page recently allocated
566 * @memcg: the memcg structure we charged against 566 * @memcg: the memcg structure we charged against
567 * @order: allocation order. 567 * @order: allocation order.
568 * 568 *
569 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or 569 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
570 * failure of the allocation. if @page is NULL, this function will revert the 570 * failure of the allocation. if @page is NULL, this function will revert the
571 * charges. Otherwise, it will commit the memcg given by @memcg to the 571 * charges. Otherwise, it will commit the memcg given by @memcg to the
572 * corresponding page_cgroup. 572 * corresponding page_cgroup.
573 */ 573 */
574 static inline void 574 static inline void
575 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 575 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
576 { 576 {
577 if (memcg_kmem_enabled() && memcg) 577 if (memcg_kmem_enabled() && memcg)
578 __memcg_kmem_commit_charge(page, memcg, order); 578 __memcg_kmem_commit_charge(page, memcg, order);
579 } 579 }
580 580
581 /** 581 /**
582 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation 582 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
583 * @cachep: the original global kmem cache 583 * @cachep: the original global kmem cache
584 * @gfp: allocation flags. 584 * @gfp: allocation flags.
585 * 585 *
586 * This function assumes that the task allocating, which determines the memcg 586 * This function assumes that the task allocating, which determines the memcg
587 * in the page allocator, belongs to the same cgroup throughout the whole 587 * in the page allocator, belongs to the same cgroup throughout the whole
588 * process. Misacounting can happen if the task calls memcg_kmem_get_cache() 588 * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
589 * while belonging to a cgroup, and later on changes. This is considered 589 * while belonging to a cgroup, and later on changes. This is considered
590 * acceptable, and should only happen upon task migration. 590 * acceptable, and should only happen upon task migration.
591 * 591 *
592 * Before the cache is created by the memcg core, there is also a possible 592 * Before the cache is created by the memcg core, there is also a possible
593 * imbalance: the task belongs to a memcg, but the cache being allocated from 593 * imbalance: the task belongs to a memcg, but the cache being allocated from
594 * is the global cache, since the child cache is not yet guaranteed to be 594 * is the global cache, since the child cache is not yet guaranteed to be
595 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be 595 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
596 * passed and the page allocator will not attempt any cgroup accounting. 596 * passed and the page allocator will not attempt any cgroup accounting.
597 */ 597 */
598 static __always_inline struct kmem_cache * 598 static __always_inline struct kmem_cache *
599 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 599 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
600 { 600 {
601 if (!memcg_kmem_enabled()) 601 if (!memcg_kmem_enabled())
602 return cachep; 602 return cachep;
603 if (gfp & __GFP_NOFAIL) 603 if (gfp & __GFP_NOFAIL)
604 return cachep; 604 return cachep;
605 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 605 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
606 return cachep; 606 return cachep;
607 if (unlikely(fatal_signal_pending(current))) 607 if (unlikely(fatal_signal_pending(current)))
608 return cachep; 608 return cachep;
609 609
610 return __memcg_kmem_get_cache(cachep, gfp); 610 return __memcg_kmem_get_cache(cachep, gfp);
611 } 611 }
612 #else 612 #else
613 #define for_each_memcg_cache_index(_idx) \ 613 #define for_each_memcg_cache_index(_idx) \
614 for (; NULL; ) 614 for (; NULL; )
615 615
616 static inline bool memcg_kmem_enabled(void) 616 static inline bool memcg_kmem_enabled(void)
617 { 617 {
618 return false; 618 return false;
619 } 619 }
620 620
621 static inline bool 621 static inline bool
622 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) 622 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
623 { 623 {
624 return true; 624 return true;
625 } 625 }
626 626
627 static inline void memcg_kmem_uncharge_pages(struct page *page, int order) 627 static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
628 { 628 {
629 } 629 }
630 630
631 static inline void 631 static inline void
632 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 632 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
633 { 633 {
634 } 634 }
635 635
636 static inline int memcg_cache_id(struct mem_cgroup *memcg) 636 static inline int memcg_cache_id(struct mem_cgroup *memcg)
637 { 637 {
638 return -1; 638 return -1;
639 } 639 }
640 640
641 static inline char *memcg_create_cache_name(struct mem_cgroup *memcg,
642 struct kmem_cache *root_cache)
643 {
644 return NULL;
645 }
646
647 static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 641 static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
648 struct kmem_cache *s, struct kmem_cache *root_cache) 642 struct kmem_cache *s, struct kmem_cache *root_cache)
649 { 643 {
650 return 0; 644 return 0;
651 } 645 }
652 646
653 static inline void memcg_free_cache_params(struct kmem_cache *s) 647 static inline void memcg_free_cache_params(struct kmem_cache *s)
654 { 648 {
655 } 649 }
656 650
657 static inline void memcg_register_cache(struct kmem_cache *s) 651 static inline void memcg_register_cache(struct kmem_cache *s)
658 { 652 {
659 } 653 }
660 654
661 static inline void memcg_unregister_cache(struct kmem_cache *s) 655 static inline void memcg_unregister_cache(struct kmem_cache *s)
662 { 656 {
663 } 657 }
664 658
665 static inline struct kmem_cache * 659 static inline struct kmem_cache *
666 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 660 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
667 { 661 {
668 return cachep; 662 return cachep;
669 } 663 }
670 664
671 static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 665 static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
672 { 666 {
673 } 667 }
674 #endif /* CONFIG_MEMCG_KMEM */ 668 #endif /* CONFIG_MEMCG_KMEM */
675 #endif /* _LINUX_MEMCONTROL_H */ 669 #endif /* _LINUX_MEMCONTROL_H */
676 670
677 671
include/linux/slab.h
1 /* 1 /*
2 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). 2 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
3 * 3 *
4 * (C) SGI 2006, Christoph Lameter 4 * (C) SGI 2006, Christoph Lameter
5 * Cleaned up and restructured to ease the addition of alternative 5 * Cleaned up and restructured to ease the addition of alternative
6 * implementations of SLAB allocators. 6 * implementations of SLAB allocators.
7 * (C) Linux Foundation 2008-2013 7 * (C) Linux Foundation 2008-2013
8 * Unified interface for all slab allocators 8 * Unified interface for all slab allocators
9 */ 9 */
10 10
11 #ifndef _LINUX_SLAB_H 11 #ifndef _LINUX_SLAB_H
12 #define _LINUX_SLAB_H 12 #define _LINUX_SLAB_H
13 13
14 #include <linux/gfp.h> 14 #include <linux/gfp.h>
15 #include <linux/types.h> 15 #include <linux/types.h>
16 #include <linux/workqueue.h> 16 #include <linux/workqueue.h>
17 17
18 18
19 /* 19 /*
20 * Flags to pass to kmem_cache_create(). 20 * Flags to pass to kmem_cache_create().
21 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. 21 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set.
22 */ 22 */
23 #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ 23 #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */
24 #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ 24 #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
25 #define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ 25 #define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
26 #define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ 26 #define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
27 #define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ 27 #define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
28 #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ 28 #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
29 #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ 29 #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
30 /* 30 /*
31 * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! 31 * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
32 * 32 *
33 * This delays freeing the SLAB page by a grace period, it does _NOT_ 33 * This delays freeing the SLAB page by a grace period, it does _NOT_
34 * delay object freeing. This means that if you do kmem_cache_free() 34 * delay object freeing. This means that if you do kmem_cache_free()
35 * that memory location is free to be reused at any time. Thus it may 35 * that memory location is free to be reused at any time. Thus it may
36 * be possible to see another object there in the same RCU grace period. 36 * be possible to see another object there in the same RCU grace period.
37 * 37 *
38 * This feature only ensures the memory location backing the object 38 * This feature only ensures the memory location backing the object
39 * stays valid, the trick to using this is relying on an independent 39 * stays valid, the trick to using this is relying on an independent
40 * object validation pass. Something like: 40 * object validation pass. Something like:
41 * 41 *
42 * rcu_read_lock() 42 * rcu_read_lock()
43 * again: 43 * again:
44 * obj = lockless_lookup(key); 44 * obj = lockless_lookup(key);
45 * if (obj) { 45 * if (obj) {
46 * if (!try_get_ref(obj)) // might fail for free objects 46 * if (!try_get_ref(obj)) // might fail for free objects
47 * goto again; 47 * goto again;
48 * 48 *
49 * if (obj->key != key) { // not the object we expected 49 * if (obj->key != key) { // not the object we expected
50 * put_ref(obj); 50 * put_ref(obj);
51 * goto again; 51 * goto again;
52 * } 52 * }
53 * } 53 * }
54 * rcu_read_unlock(); 54 * rcu_read_unlock();
55 * 55 *
56 * This is useful if we need to approach a kernel structure obliquely, 56 * This is useful if we need to approach a kernel structure obliquely,
57 * from its address obtained without the usual locking. We can lock 57 * from its address obtained without the usual locking. We can lock
58 * the structure to stabilize it and check it's still at the given address, 58 * the structure to stabilize it and check it's still at the given address,
59 * only if we can be sure that the memory has not been meanwhile reused 59 * only if we can be sure that the memory has not been meanwhile reused
60 * for some other kind of object (which our subsystem's lock might corrupt). 60 * for some other kind of object (which our subsystem's lock might corrupt).
61 * 61 *
62 * rcu_read_lock before reading the address, then rcu_read_unlock after 62 * rcu_read_lock before reading the address, then rcu_read_unlock after
63 * taking the spinlock within the structure expected at that address. 63 * taking the spinlock within the structure expected at that address.
64 */ 64 */
65 #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ 65 #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
66 #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ 66 #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */
67 #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ 67 #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */
68 68
69 /* Flag to prevent checks on free */ 69 /* Flag to prevent checks on free */
70 #ifdef CONFIG_DEBUG_OBJECTS 70 #ifdef CONFIG_DEBUG_OBJECTS
71 # define SLAB_DEBUG_OBJECTS 0x00400000UL 71 # define SLAB_DEBUG_OBJECTS 0x00400000UL
72 #else 72 #else
73 # define SLAB_DEBUG_OBJECTS 0x00000000UL 73 # define SLAB_DEBUG_OBJECTS 0x00000000UL
74 #endif 74 #endif
75 75
76 #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ 76 #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */
77 77
78 /* Don't track use of uninitialized memory */ 78 /* Don't track use of uninitialized memory */
79 #ifdef CONFIG_KMEMCHECK 79 #ifdef CONFIG_KMEMCHECK
80 # define SLAB_NOTRACK 0x01000000UL 80 # define SLAB_NOTRACK 0x01000000UL
81 #else 81 #else
82 # define SLAB_NOTRACK 0x00000000UL 82 # define SLAB_NOTRACK 0x00000000UL
83 #endif 83 #endif
84 #ifdef CONFIG_FAILSLAB 84 #ifdef CONFIG_FAILSLAB
85 # define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ 85 # define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */
86 #else 86 #else
87 # define SLAB_FAILSLAB 0x00000000UL 87 # define SLAB_FAILSLAB 0x00000000UL
88 #endif 88 #endif
89 89
90 /* The following flags affect the page allocator grouping pages by mobility */ 90 /* The following flags affect the page allocator grouping pages by mobility */
91 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 91 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
92 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 92 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
93 /* 93 /*
94 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 94 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
95 * 95 *
96 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. 96 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
97 * 97 *
98 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. 98 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
99 * Both make kfree a no-op. 99 * Both make kfree a no-op.
100 */ 100 */
101 #define ZERO_SIZE_PTR ((void *)16) 101 #define ZERO_SIZE_PTR ((void *)16)
102 102
103 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ 103 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
104 (unsigned long)ZERO_SIZE_PTR) 104 (unsigned long)ZERO_SIZE_PTR)
105 105
106 #include <linux/kmemleak.h> 106 #include <linux/kmemleak.h>
107 107
108 struct mem_cgroup; 108 struct mem_cgroup;
109 /* 109 /*
110 * struct kmem_cache related prototypes 110 * struct kmem_cache related prototypes
111 */ 111 */
112 void __init kmem_cache_init(void); 112 void __init kmem_cache_init(void);
113 int slab_is_available(void); 113 int slab_is_available(void);
114 114
115 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 115 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
116 unsigned long, 116 unsigned long,
117 void (*)(void *)); 117 void (*)(void *));
118 struct kmem_cache * 118 #ifdef CONFIG_MEMCG_KMEM
119 kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, 119 void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *);
120 unsigned long, void (*)(void *), struct kmem_cache *); 120 #endif
121 void kmem_cache_destroy(struct kmem_cache *); 121 void kmem_cache_destroy(struct kmem_cache *);
122 int kmem_cache_shrink(struct kmem_cache *); 122 int kmem_cache_shrink(struct kmem_cache *);
123 void kmem_cache_free(struct kmem_cache *, void *); 123 void kmem_cache_free(struct kmem_cache *, void *);
124 124
125 /* 125 /*
126 * Please use this macro to create slab caches. Simply specify the 126 * Please use this macro to create slab caches. Simply specify the
127 * name of the structure and maybe some flags that are listed above. 127 * name of the structure and maybe some flags that are listed above.
128 * 128 *
129 * The alignment of the struct determines object alignment. If you 129 * The alignment of the struct determines object alignment. If you
130 * f.e. add ____cacheline_aligned_in_smp to the struct declaration 130 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
131 * then the objects will be properly aligned in SMP configurations. 131 * then the objects will be properly aligned in SMP configurations.
132 */ 132 */
133 #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ 133 #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
134 sizeof(struct __struct), __alignof__(struct __struct),\ 134 sizeof(struct __struct), __alignof__(struct __struct),\
135 (__flags), NULL) 135 (__flags), NULL)
136 136
137 /* 137 /*
138 * Common kmalloc functions provided by all allocators 138 * Common kmalloc functions provided by all allocators
139 */ 139 */
140 void * __must_check __krealloc(const void *, size_t, gfp_t); 140 void * __must_check __krealloc(const void *, size_t, gfp_t);
141 void * __must_check krealloc(const void *, size_t, gfp_t); 141 void * __must_check krealloc(const void *, size_t, gfp_t);
142 void kfree(const void *); 142 void kfree(const void *);
143 void kzfree(const void *); 143 void kzfree(const void *);
144 size_t ksize(const void *); 144 size_t ksize(const void *);
145 145
146 /* 146 /*
147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
148 * alignment larger than the alignment of a 64-bit integer. 148 * alignment larger than the alignment of a 64-bit integer.
149 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. 149 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
150 */ 150 */
151 #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 151 #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
152 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN 152 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
153 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN 153 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
154 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) 154 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
155 #else 155 #else
156 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 156 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
157 #endif 157 #endif
158 158
159 #ifdef CONFIG_SLOB 159 #ifdef CONFIG_SLOB
160 /* 160 /*
161 * Common fields provided in kmem_cache by all slab allocators 161 * Common fields provided in kmem_cache by all slab allocators
162 * This struct is either used directly by the allocator (SLOB) 162 * This struct is either used directly by the allocator (SLOB)
163 * or the allocator must include definitions for all fields 163 * or the allocator must include definitions for all fields
164 * provided in kmem_cache_common in their definition of kmem_cache. 164 * provided in kmem_cache_common in their definition of kmem_cache.
165 * 165 *
166 * Once we can do anonymous structs (C11 standard) we could put a 166 * Once we can do anonymous structs (C11 standard) we could put a
167 * anonymous struct definition in these allocators so that the 167 * anonymous struct definition in these allocators so that the
168 * separate allocations in the kmem_cache structure of SLAB and 168 * separate allocations in the kmem_cache structure of SLAB and
169 * SLUB is no longer needed. 169 * SLUB is no longer needed.
170 */ 170 */
171 struct kmem_cache { 171 struct kmem_cache {
172 unsigned int object_size;/* The original size of the object */ 172 unsigned int object_size;/* The original size of the object */
173 unsigned int size; /* The aligned/padded/added on size */ 173 unsigned int size; /* The aligned/padded/added on size */
174 unsigned int align; /* Alignment as calculated */ 174 unsigned int align; /* Alignment as calculated */
175 unsigned long flags; /* Active flags on the slab */ 175 unsigned long flags; /* Active flags on the slab */
176 const char *name; /* Slab name for sysfs */ 176 const char *name; /* Slab name for sysfs */
177 int refcount; /* Use counter */ 177 int refcount; /* Use counter */
178 void (*ctor)(void *); /* Called on object slot creation */ 178 void (*ctor)(void *); /* Called on object slot creation */
179 struct list_head list; /* List of all slab caches on the system */ 179 struct list_head list; /* List of all slab caches on the system */
180 }; 180 };
181 181
182 #endif /* CONFIG_SLOB */ 182 #endif /* CONFIG_SLOB */
183 183
184 /* 184 /*
185 * Kmalloc array related definitions 185 * Kmalloc array related definitions
186 */ 186 */
187 187
188 #ifdef CONFIG_SLAB 188 #ifdef CONFIG_SLAB
189 /* 189 /*
190 * The largest kmalloc size supported by the SLAB allocators is 190 * The largest kmalloc size supported by the SLAB allocators is
191 * 32 megabyte (2^25) or the maximum allocatable page order if that is 191 * 32 megabyte (2^25) or the maximum allocatable page order if that is
192 * less than 32 MB. 192 * less than 32 MB.
193 * 193 *
194 * WARNING: Its not easy to increase this value since the allocators have 194 * WARNING: Its not easy to increase this value since the allocators have
195 * to do various tricks to work around compiler limitations in order to 195 * to do various tricks to work around compiler limitations in order to
196 * ensure proper constant folding. 196 * ensure proper constant folding.
197 */ 197 */
198 #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ 198 #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
199 (MAX_ORDER + PAGE_SHIFT - 1) : 25) 199 (MAX_ORDER + PAGE_SHIFT - 1) : 25)
200 #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH 200 #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
201 #ifndef KMALLOC_SHIFT_LOW 201 #ifndef KMALLOC_SHIFT_LOW
202 #define KMALLOC_SHIFT_LOW 5 202 #define KMALLOC_SHIFT_LOW 5
203 #endif 203 #endif
204 #endif 204 #endif
205 205
206 #ifdef CONFIG_SLUB 206 #ifdef CONFIG_SLUB
207 /* 207 /*
208 * SLUB directly allocates requests fitting into an order-1 page 208 * SLUB directly allocates requests fitting into an order-1 page
209 * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 209 * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
210 */ 210 */
211 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) 211 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
212 #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) 212 #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
213 #ifndef KMALLOC_SHIFT_LOW 213 #ifndef KMALLOC_SHIFT_LOW
214 #define KMALLOC_SHIFT_LOW 3 214 #define KMALLOC_SHIFT_LOW 3
215 #endif 215 #endif
216 #endif 216 #endif
217 217
218 #ifdef CONFIG_SLOB 218 #ifdef CONFIG_SLOB
219 /* 219 /*
220 * SLOB passes all requests larger than one page to the page allocator. 220 * SLOB passes all requests larger than one page to the page allocator.
221 * No kmalloc array is necessary since objects of different sizes can 221 * No kmalloc array is necessary since objects of different sizes can
222 * be allocated from the same page. 222 * be allocated from the same page.
223 */ 223 */
224 #define KMALLOC_SHIFT_HIGH PAGE_SHIFT 224 #define KMALLOC_SHIFT_HIGH PAGE_SHIFT
225 #define KMALLOC_SHIFT_MAX 30 225 #define KMALLOC_SHIFT_MAX 30
226 #ifndef KMALLOC_SHIFT_LOW 226 #ifndef KMALLOC_SHIFT_LOW
227 #define KMALLOC_SHIFT_LOW 3 227 #define KMALLOC_SHIFT_LOW 3
228 #endif 228 #endif
229 #endif 229 #endif
230 230
231 /* Maximum allocatable size */ 231 /* Maximum allocatable size */
232 #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) 232 #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX)
233 /* Maximum size for which we actually use a slab cache */ 233 /* Maximum size for which we actually use a slab cache */
234 #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) 234 #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
235 /* Maximum order allocatable via the slab allocator */ 235 /* Maximum order allocatable via the slab allocator */
236 #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) 236 #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
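To make the limits above concrete, here is a minimal userspace sketch that re-derives them for the SLUB case, assuming 4 KiB pages (PAGE_SHIFT of 12) and MAX_ORDER of 11; both values are assumptions for illustration, not taken from this patch.

#include <stdio.h>

/* Assumed configuration, for illustration only. */
#define PAGE_SHIFT	12			/* 4 KiB pages */
#define MAX_ORDER	11

/* SLUB flavour of the macros above. */
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)		/* 13 */
#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)	/* 23 */

int main(void)
{
	/* Largest size served from a kmalloc slab cache: 8 KiB (an order-1 page). */
	printf("KMALLOC_MAX_CACHE_SIZE = %lu\n", 1UL << KMALLOC_SHIFT_HIGH);
	/* Largest size kmalloc() will hand to the page allocator: 8 MiB. */
	printf("KMALLOC_MAX_SIZE       = %lu\n", 1UL << KMALLOC_SHIFT_MAX);
	/* Highest page order that corresponds to: 11. */
	printf("KMALLOC_MAX_ORDER      = %d\n", KMALLOC_SHIFT_MAX - PAGE_SHIFT);
	return 0;
}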
237 237
238 /* 238 /*
239 * Kmalloc subsystem. 239 * Kmalloc subsystem.
240 */ 240 */
241 #ifndef KMALLOC_MIN_SIZE 241 #ifndef KMALLOC_MIN_SIZE
242 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) 242 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
243 #endif 243 #endif
244 244
245 #ifndef CONFIG_SLOB 245 #ifndef CONFIG_SLOB
246 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 246 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
247 #ifdef CONFIG_ZONE_DMA 247 #ifdef CONFIG_ZONE_DMA
248 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 248 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
249 #endif 249 #endif
250 250
251 /* 251 /*
252 * Figure out which kmalloc slab an allocation of a certain size 252 * Figure out which kmalloc slab an allocation of a certain size
253 * belongs to. 253 * belongs to.
254 * 0 = zero alloc 254 * 0 = zero alloc
255 * 1 = 65 .. 96 bytes 255 * 1 = 65 .. 96 bytes
256 * 2 = 129 .. 192 bytes 256 * 2 = 129 .. 192 bytes
257 * n = 2^(n-1)+1 .. 2^n 257 * n = 2^(n-1)+1 .. 2^n
258 */ 258 */
259 static __always_inline int kmalloc_index(size_t size) 259 static __always_inline int kmalloc_index(size_t size)
260 { 260 {
261 if (!size) 261 if (!size)
262 return 0; 262 return 0;
263 263
264 if (size <= KMALLOC_MIN_SIZE) 264 if (size <= KMALLOC_MIN_SIZE)
265 return KMALLOC_SHIFT_LOW; 265 return KMALLOC_SHIFT_LOW;
266 266
267 if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) 267 if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
268 return 1; 268 return 1;
269 if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) 269 if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
270 return 2; 270 return 2;
271 if (size <= 8) return 3; 271 if (size <= 8) return 3;
272 if (size <= 16) return 4; 272 if (size <= 16) return 4;
273 if (size <= 32) return 5; 273 if (size <= 32) return 5;
274 if (size <= 64) return 6; 274 if (size <= 64) return 6;
275 if (size <= 128) return 7; 275 if (size <= 128) return 7;
276 if (size <= 256) return 8; 276 if (size <= 256) return 8;
277 if (size <= 512) return 9; 277 if (size <= 512) return 9;
278 if (size <= 1024) return 10; 278 if (size <= 1024) return 10;
279 if (size <= 2 * 1024) return 11; 279 if (size <= 2 * 1024) return 11;
280 if (size <= 4 * 1024) return 12; 280 if (size <= 4 * 1024) return 12;
281 if (size <= 8 * 1024) return 13; 281 if (size <= 8 * 1024) return 13;
282 if (size <= 16 * 1024) return 14; 282 if (size <= 16 * 1024) return 14;
283 if (size <= 32 * 1024) return 15; 283 if (size <= 32 * 1024) return 15;
284 if (size <= 64 * 1024) return 16; 284 if (size <= 64 * 1024) return 16;
285 if (size <= 128 * 1024) return 17; 285 if (size <= 128 * 1024) return 17;
286 if (size <= 256 * 1024) return 18; 286 if (size <= 256 * 1024) return 18;
287 if (size <= 512 * 1024) return 19; 287 if (size <= 512 * 1024) return 19;
288 if (size <= 1024 * 1024) return 20; 288 if (size <= 1024 * 1024) return 20;
289 if (size <= 2 * 1024 * 1024) return 21; 289 if (size <= 2 * 1024 * 1024) return 21;
290 if (size <= 4 * 1024 * 1024) return 22; 290 if (size <= 4 * 1024 * 1024) return 22;
291 if (size <= 8 * 1024 * 1024) return 23; 291 if (size <= 8 * 1024 * 1024) return 23;
292 if (size <= 16 * 1024 * 1024) return 24; 292 if (size <= 16 * 1024 * 1024) return 24;
293 if (size <= 32 * 1024 * 1024) return 25; 293 if (size <= 32 * 1024 * 1024) return 25;
294 if (size <= 64 * 1024 * 1024) return 26; 294 if (size <= 64 * 1024 * 1024) return 26;
295 BUG(); 295 BUG();
296 296
297 /* Will never be reached. Needed because the compiler may complain */ 297 /* Will never be reached. Needed because the compiler may complain */
298 return -1; 298 return -1;
299 } 299 }
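A quick feel for the mapping, assuming KMALLOC_MIN_SIZE is 8 (KMALLOC_SHIFT_LOW of 3) and a non-SLOB build: kmalloc_index(8) returns 3, kmalloc_index(96) hits the special index 1, and kmalloc_index(100) rounds up to the 128-byte cache at index 7. A minimal sketch of how a constant size resolves to a cache slot follows; the function name is made up for illustration.

/* Illustration only; assumes KMALLOC_MIN_SIZE == 8 and !CONFIG_SLOB. */
static __always_inline struct kmem_cache *cache_for_100_bytes(void)
{
	int index = kmalloc_index(100);	/* 100 > 96 and <= 128, so index == 7 */

	/* This is essentially what kmalloc() below does for constant sizes. */
	return kmalloc_caches[index];
}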
300 #endif /* !CONFIG_SLOB */ 300 #endif /* !CONFIG_SLOB */
301 301
302 void *__kmalloc(size_t size, gfp_t flags); 302 void *__kmalloc(size_t size, gfp_t flags);
303 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); 303 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
304 304
305 #ifdef CONFIG_NUMA 305 #ifdef CONFIG_NUMA
306 void *__kmalloc_node(size_t size, gfp_t flags, int node); 306 void *__kmalloc_node(size_t size, gfp_t flags, int node);
307 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 307 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
308 #else 308 #else
309 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) 309 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
310 { 310 {
311 return __kmalloc(size, flags); 311 return __kmalloc(size, flags);
312 } 312 }
313 313
314 static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) 314 static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
315 { 315 {
316 return kmem_cache_alloc(s, flags); 316 return kmem_cache_alloc(s, flags);
317 } 317 }
318 #endif 318 #endif
319 319
320 #ifdef CONFIG_TRACING 320 #ifdef CONFIG_TRACING
321 extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t); 321 extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
322 322
323 #ifdef CONFIG_NUMA 323 #ifdef CONFIG_NUMA
324 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 324 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
325 gfp_t gfpflags, 325 gfp_t gfpflags,
326 int node, size_t size); 326 int node, size_t size);
327 #else 327 #else
328 static __always_inline void * 328 static __always_inline void *
329 kmem_cache_alloc_node_trace(struct kmem_cache *s, 329 kmem_cache_alloc_node_trace(struct kmem_cache *s,
330 gfp_t gfpflags, 330 gfp_t gfpflags,
331 int node, size_t size) 331 int node, size_t size)
332 { 332 {
333 return kmem_cache_alloc_trace(s, gfpflags, size); 333 return kmem_cache_alloc_trace(s, gfpflags, size);
334 } 334 }
335 #endif /* CONFIG_NUMA */ 335 #endif /* CONFIG_NUMA */
336 336
337 #else /* CONFIG_TRACING */ 337 #else /* CONFIG_TRACING */
338 static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, 338 static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
339 gfp_t flags, size_t size) 339 gfp_t flags, size_t size)
340 { 340 {
341 return kmem_cache_alloc(s, flags); 341 return kmem_cache_alloc(s, flags);
342 } 342 }
343 343
344 static __always_inline void * 344 static __always_inline void *
345 kmem_cache_alloc_node_trace(struct kmem_cache *s, 345 kmem_cache_alloc_node_trace(struct kmem_cache *s,
346 gfp_t gfpflags, 346 gfp_t gfpflags,
347 int node, size_t size) 347 int node, size_t size)
348 { 348 {
349 return kmem_cache_alloc_node(s, gfpflags, node); 349 return kmem_cache_alloc_node(s, gfpflags, node);
350 } 350 }
351 #endif /* CONFIG_TRACING */ 351 #endif /* CONFIG_TRACING */
352 352
353 #ifdef CONFIG_SLAB 353 #ifdef CONFIG_SLAB
354 #include <linux/slab_def.h> 354 #include <linux/slab_def.h>
355 #endif 355 #endif
356 356
357 #ifdef CONFIG_SLUB 357 #ifdef CONFIG_SLUB
358 #include <linux/slub_def.h> 358 #include <linux/slub_def.h>
359 #endif 359 #endif
360 360
361 static __always_inline void * 361 static __always_inline void *
362 kmalloc_order(size_t size, gfp_t flags, unsigned int order) 362 kmalloc_order(size_t size, gfp_t flags, unsigned int order)
363 { 363 {
364 void *ret; 364 void *ret;
365 365
366 flags |= (__GFP_COMP | __GFP_KMEMCG); 366 flags |= (__GFP_COMP | __GFP_KMEMCG);
367 ret = (void *) __get_free_pages(flags, order); 367 ret = (void *) __get_free_pages(flags, order);
368 kmemleak_alloc(ret, size, 1, flags); 368 kmemleak_alloc(ret, size, 1, flags);
369 return ret; 369 return ret;
370 } 370 }
371 371
372 #ifdef CONFIG_TRACING 372 #ifdef CONFIG_TRACING
373 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); 373 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
374 #else 374 #else
375 static __always_inline void * 375 static __always_inline void *
376 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 376 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
377 { 377 {
378 return kmalloc_order(size, flags, order); 378 return kmalloc_order(size, flags, order);
379 } 379 }
380 #endif 380 #endif
381 381
382 static __always_inline void *kmalloc_large(size_t size, gfp_t flags) 382 static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
383 { 383 {
384 unsigned int order = get_order(size); 384 unsigned int order = get_order(size);
385 return kmalloc_order_trace(size, flags, order); 385 return kmalloc_order_trace(size, flags, order);
386 } 386 }
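For example (assuming 4 KiB pages, an assumption not stated in this hunk), a 100 KiB request that is too large for the kmalloc caches goes this way: get_order() yields 5, so 32 contiguous pages (128 KiB) are allocated. A hedged sketch, with a made-up function name:

static inline void *example_large_alloc(void)
{
	/* 100 KiB: get_order() == 5 with 4 KiB pages, i.e. a 128 KiB allocation. */
	return kmalloc_large(100 * 1024, GFP_KERNEL);
}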
387 387
388 /** 388 /**
389 * kmalloc - allocate memory 389 * kmalloc - allocate memory
390 * @size: how many bytes of memory are required. 390 * @size: how many bytes of memory are required.
391 * @flags: the type of memory to allocate. 391 * @flags: the type of memory to allocate.
392 * 392 *
393 * kmalloc is the normal method of allocating memory 393 * kmalloc is the normal method of allocating memory
394 * for objects smaller than page size in the kernel. 394 * for objects smaller than page size in the kernel.
395 * 395 *
396 * The @flags argument may be one of: 396 * The @flags argument may be one of:
397 * 397 *
398 * %GFP_USER - Allocate memory on behalf of user. May sleep. 398 * %GFP_USER - Allocate memory on behalf of user. May sleep.
399 * 399 *
400 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 400 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
401 * 401 *
402 * %GFP_ATOMIC - Allocation will not sleep. May use emergency pools. 402 * %GFP_ATOMIC - Allocation will not sleep. May use emergency pools.
403 * For example, use this inside interrupt handlers. 403 * For example, use this inside interrupt handlers.
404 * 404 *
405 * %GFP_HIGHUSER - Allocate pages from high memory. 405 * %GFP_HIGHUSER - Allocate pages from high memory.
406 * 406 *
407 * %GFP_NOIO - Do not do any I/O at all while trying to get memory. 407 * %GFP_NOIO - Do not do any I/O at all while trying to get memory.
408 * 408 *
409 * %GFP_NOFS - Do not make any fs calls while trying to get memory. 409 * %GFP_NOFS - Do not make any fs calls while trying to get memory.
410 * 410 *
411 * %GFP_NOWAIT - Allocation will not sleep. 411 * %GFP_NOWAIT - Allocation will not sleep.
412 * 412 *
413 * %__GFP_THISNODE - Allocate node-local memory only. 413 * %__GFP_THISNODE - Allocate node-local memory only.
414 * 414 *
415 * %GFP_DMA - Allocation suitable for DMA. 415 * %GFP_DMA - Allocation suitable for DMA.
416 * Should only be used for kmalloc() caches. Otherwise, use a 416 * Should only be used for kmalloc() caches. Otherwise, use a
417 * slab created with SLAB_CACHE_DMA. 417 * slab created with SLAB_CACHE_DMA.
418 * 418 *
419 * Also it is possible to set different flags by OR'ing 419 * Also it is possible to set different flags by OR'ing
420 * in one or more of the following additional @flags: 420 * in one or more of the following additional @flags:
421 * 421 *
422 * %__GFP_COLD - Request cache-cold pages instead of 422 * %__GFP_COLD - Request cache-cold pages instead of
423 * trying to return cache-warm pages. 423 * trying to return cache-warm pages.
424 * 424 *
425 * %__GFP_HIGH - This allocation has high priority and may use emergency pools. 425 * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
426 * 426 *
427 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail 427 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
428 * (think twice before using). 428 * (think twice before using).
429 * 429 *
430 * %__GFP_NORETRY - If memory is not immediately available, 430 * %__GFP_NORETRY - If memory is not immediately available,
431 * then give up at once. 431 * then give up at once.
432 * 432 *
433 * %__GFP_NOWARN - If allocation fails, don't issue any warnings. 433 * %__GFP_NOWARN - If allocation fails, don't issue any warnings.
434 * 434 *
435 * %__GFP_REPEAT - If allocation fails initially, try once more before failing. 435 * %__GFP_REPEAT - If allocation fails initially, try once more before failing.
436 * 436 *
437 * There are other flags available as well, but these are not intended 437 * There are other flags available as well, but these are not intended
438 * for general use, and so are not documented here. For a full list of 438 * for general use, and so are not documented here. For a full list of
439 * potential flags, always refer to linux/gfp.h. 439 * potential flags, always refer to linux/gfp.h.
440 */ 440 */
441 static __always_inline void *kmalloc(size_t size, gfp_t flags) 441 static __always_inline void *kmalloc(size_t size, gfp_t flags)
442 { 442 {
443 if (__builtin_constant_p(size)) { 443 if (__builtin_constant_p(size)) {
444 if (size > KMALLOC_MAX_CACHE_SIZE) 444 if (size > KMALLOC_MAX_CACHE_SIZE)
445 return kmalloc_large(size, flags); 445 return kmalloc_large(size, flags);
446 #ifndef CONFIG_SLOB 446 #ifndef CONFIG_SLOB
447 if (!(flags & GFP_DMA)) { 447 if (!(flags & GFP_DMA)) {
448 int index = kmalloc_index(size); 448 int index = kmalloc_index(size);
449 449
450 if (!index) 450 if (!index)
451 return ZERO_SIZE_PTR; 451 return ZERO_SIZE_PTR;
452 452
453 return kmem_cache_alloc_trace(kmalloc_caches[index], 453 return kmem_cache_alloc_trace(kmalloc_caches[index],
454 flags, size); 454 flags, size);
455 } 455 }
456 #endif 456 #endif
457 } 457 }
458 return __kmalloc(size, flags); 458 return __kmalloc(size, flags);
459 } 459 }
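A minimal sketch of the two paths, with made-up names (struct foo, kmalloc_path_examples) on a non-SLOB build; error handling is trimmed and kfree(NULL) being a no-op is relied on.

struct foo {
	int a, b;
};

static void kmalloc_path_examples(size_t runtime_len, gfp_t flags)
{
	/*
	 * Constant size: __builtin_constant_p() is true, so the call folds to
	 * kmem_cache_alloc_trace(kmalloc_caches[index], ...) at compile time.
	 */
	struct foo *p = kmalloc(sizeof(*p), flags);

	/* Variable size: falls through to __kmalloc() and a run-time lookup. */
	void *buf = kmalloc(runtime_len, flags);

	kfree(buf);
	kfree(p);
}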
460 460
461 /* 461 /*
462 * Determine the size used for the nth kmalloc cache. 462 * Determine the size used for the nth kmalloc cache.
463 * Return the size, or 0 if a kmalloc cache for that 463 * Return the size, or 0 if a kmalloc cache for that
464 * size does not exist. 464 * size does not exist.
465 */ 465 */
466 static __always_inline int kmalloc_size(int n) 466 static __always_inline int kmalloc_size(int n)
467 { 467 {
468 #ifndef CONFIG_SLOB 468 #ifndef CONFIG_SLOB
469 if (n > 2) 469 if (n > 2)
470 return 1 << n; 470 return 1 << n;
471 471
472 if (n == 1 && KMALLOC_MIN_SIZE <= 32) 472 if (n == 1 && KMALLOC_MIN_SIZE <= 32)
473 return 96; 473 return 96;
474 474
475 if (n == 2 && KMALLOC_MIN_SIZE <= 64) 475 if (n == 2 && KMALLOC_MIN_SIZE <= 64)
476 return 192; 476 return 192;
477 #endif 477 #endif
478 return 0; 478 return 0;
479 } 479 }
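As a sanity check of the mapping (again assuming KMALLOC_MIN_SIZE of 8), kmalloc_size() and kmalloc_index() are inverses for sizes that actually have a cache; a small sketch with a made-up function name:

static inline void kmalloc_size_examples(void)
{
	BUG_ON(kmalloc_size(kmalloc_index(96))  != 96);		/* special 96-byte cache  */
	BUG_ON(kmalloc_size(kmalloc_index(192)) != 192);	/* special 192-byte cache */
	BUG_ON(kmalloc_size(kmalloc_index(100)) != 128);	/* rounded up to 128      */
}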
480 480
481 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) 481 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
482 { 482 {
483 #ifndef CONFIG_SLOB 483 #ifndef CONFIG_SLOB
484 if (__builtin_constant_p(size) && 484 if (__builtin_constant_p(size) &&
485 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { 485 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
486 int i = kmalloc_index(size); 486 int i = kmalloc_index(size);
487 487
488 if (!i) 488 if (!i)
489 return ZERO_SIZE_PTR; 489 return ZERO_SIZE_PTR;
490 490
491 return kmem_cache_alloc_node_trace(kmalloc_caches[i], 491 return kmem_cache_alloc_node_trace(kmalloc_caches[i],
492 flags, node, size); 492 flags, node, size);
493 } 493 }
494 #endif 494 #endif
495 return __kmalloc_node(size, flags, node); 495 return __kmalloc_node(size, flags, node);
496 } 496 }
497 497
498 /* 498 /*
499 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. 499 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
500 * Intended for arches that get misalignment faults even for 64 bit integer 500 * Intended for arches that get misalignment faults even for 64 bit integer
501 * aligned buffers. 501 * aligned buffers.
502 */ 502 */
503 #ifndef ARCH_SLAB_MINALIGN 503 #ifndef ARCH_SLAB_MINALIGN
504 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 504 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
505 #endif 505 #endif
506 /* 506 /*
507 * This is the main placeholder for memcg-related information in kmem caches. 507 * This is the main placeholder for memcg-related information in kmem caches.
508 * struct kmem_cache will hold a pointer to it, so the memory cost while 508 * struct kmem_cache will hold a pointer to it, so the memory cost while
509 * disabled is 1 pointer. The runtime cost while enabled is higher than it 509 * disabled is 1 pointer. The runtime cost while enabled is higher than it
510 * would be if this were bundled into kmem_cache: we'd need an 510 * would be if this were bundled into kmem_cache: we'd need an
511 * extra pointer chase. But the trade-off clearly lies in favor of not 511 * extra pointer chase. But the trade-off clearly lies in favor of not
512 * penalizing non-users. 512 * penalizing non-users.
513 * 513 *
514 * Both the root cache and the child caches will have it. For the root cache, 514 * Both the root cache and the child caches will have it. For the root cache,
515 * this will hold a dynamically allocated array large enough to hold 515 * this will hold a dynamically allocated array large enough to hold
516 * information about the currently limited memcgs in the system. To allow the 516 * information about the currently limited memcgs in the system. To allow the
517 * array to be accessed without taking any locks, on relocation we free the old 517 * array to be accessed without taking any locks, on relocation we free the old
518 * version only after a grace period. 518 * version only after a grace period.
519 * 519 *
520 * Child caches will hold extra metadata needed for their operation. Fields are: 520 * Child caches will hold extra metadata needed for their operation. Fields are:
521 * 521 *
522 * @memcg: pointer to the memcg this cache belongs to 522 * @memcg: pointer to the memcg this cache belongs to
523 * @list: list_head for the list of all caches in this memcg 523 * @list: list_head for the list of all caches in this memcg
524 * @root_cache: pointer to the global, root cache, this cache was derived from 524 * @root_cache: pointer to the global, root cache, this cache was derived from
525 * @dead: set to true after the memcg dies; the cache may still be around. 525 * @dead: set to true after the memcg dies; the cache may still be around.
526 * @nr_pages: number of pages that belong to this cache. 526 * @nr_pages: number of pages that belong to this cache.
527 * @destroy: worker to be called whenever we are ready, or believe we may be 527 * @destroy: worker to be called whenever we are ready, or believe we may be
528 * ready, to destroy this cache. 528 * ready, to destroy this cache.
529 */ 529 */
530 struct memcg_cache_params { 530 struct memcg_cache_params {
531 bool is_root_cache; 531 bool is_root_cache;
532 union { 532 union {
533 struct { 533 struct {
534 struct rcu_head rcu_head; 534 struct rcu_head rcu_head;
535 struct kmem_cache *memcg_caches[0]; 535 struct kmem_cache *memcg_caches[0];
536 }; 536 };
537 struct { 537 struct {
538 struct mem_cgroup *memcg; 538 struct mem_cgroup *memcg;
539 struct list_head list; 539 struct list_head list;
540 struct kmem_cache *root_cache; 540 struct kmem_cache *root_cache;
541 bool dead; 541 bool dead;
542 atomic_t nr_pages; 542 atomic_t nr_pages;
543 struct work_struct destroy; 543 struct work_struct destroy;
544 }; 544 };
545 }; 545 };
546 }; 546 };
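To illustrate the layout described above: a root cache reaches the copy used by a given memcg through the memcg_caches[] array, indexed by that memcg's kmemcg_id. This is only a sketch under the assumption of CONFIG_MEMCG_KMEM (where SLAB's and SLUB's struct kmem_cache carry a memcg_params pointer); the helper name is made up, and the real accessors in mm/slab.h also dereference the array under RCU, per the grace-period rule above.

static struct kmem_cache *example_child_cache(struct kmem_cache *root,
					      int kmemcg_id)
{
	if (!root->memcg_params || !root->memcg_params->is_root_cache)
		return root;	/* no memcg copies, or already a per-memcg cache */

	/* NULL until the memcg cache creation path has filled in this slot. */
	return root->memcg_params->memcg_caches[kmemcg_id];
}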
547 547
548 int memcg_update_all_caches(int num_memcgs); 548 int memcg_update_all_caches(int num_memcgs);
549 549
550 struct seq_file; 550 struct seq_file;
551 int cache_show(struct kmem_cache *s, struct seq_file *m); 551 int cache_show(struct kmem_cache *s, struct seq_file *m);
552 void print_slabinfo_header(struct seq_file *m); 552 void print_slabinfo_header(struct seq_file *m);
553 553
554 /** 554 /**
555 * kmalloc_array - allocate memory for an array. 555 * kmalloc_array - allocate memory for an array.
556 * @n: number of elements. 556 * @n: number of elements.
557 * @size: element size. 557 * @size: element size.
558 * @flags: the type of memory to allocate (see kmalloc). 558 * @flags: the type of memory to allocate (see kmalloc).
559 */ 559 */
560 static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) 560 static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
561 { 561 {
562 if (size != 0 && n > SIZE_MAX / size) 562 if (size != 0 && n > SIZE_MAX / size)
563 return NULL; 563 return NULL;
564 return __kmalloc(n * size, flags); 564 return __kmalloc(n * size, flags);
565 } 565 }
566 566
567 /** 567 /**
568 * kcalloc - allocate memory for an array. The memory is set to zero. 568 * kcalloc - allocate memory for an array. The memory is set to zero.
569 * @n: number of elements. 569 * @n: number of elements.
570 * @size: element size. 570 * @size: element size.
571 * @flags: the type of memory to allocate (see kmalloc). 571 * @flags: the type of memory to allocate (see kmalloc).
572 */ 572 */
573 static inline void *kcalloc(size_t n, size_t size, gfp_t flags) 573 static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
574 { 574 {
575 return kmalloc_array(n, size, flags | __GFP_ZERO); 575 return kmalloc_array(n, size, flags | __GFP_ZERO);
576 } 576 }
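A short usage sketch (names are made up): allocating an array whose length comes from the caller, relying on kcalloc()/kmalloc_array() to return NULL instead of silently wrapping when n * size would overflow.

struct item {
	u64 key;
	u64 val;
};

static struct item *alloc_items(size_t n, gfp_t flags)
{
	/* NULL on allocation failure *and* on n * sizeof(struct item) overflow. */
	return kcalloc(n, sizeof(struct item), flags);
}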
577 577
578 /* 578 /*
579 * kmalloc_track_caller is a special version of kmalloc that records the 579 * kmalloc_track_caller is a special version of kmalloc that records the
580 * calling function of the routine calling it for slab leak tracking instead 580 * calling function of the routine calling it for slab leak tracking instead
581 * of just the calling function (confusing, eh?). 581 * of just the calling function (confusing, eh?).
582 * It's useful when the call to kmalloc comes from a widely-used standard 582 * It's useful when the call to kmalloc comes from a widely-used standard
583 * allocator where we care about the real place the memory allocation 583 * allocator where we care about the real place the memory allocation
584 * request comes from. 584 * request comes from.
585 */ 585 */
586 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ 586 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
587 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ 587 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
588 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) 588 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
589 extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); 589 extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
590 #define kmalloc_track_caller(size, flags) \ 590 #define kmalloc_track_caller(size, flags) \
591 __kmalloc_track_caller(size, flags, _RET_IP_) 591 __kmalloc_track_caller(size, flags, _RET_IP_)
592 #else 592 #else
593 #define kmalloc_track_caller(size, flags) \ 593 #define kmalloc_track_caller(size, flags) \
594 __kmalloc(size, flags) 594 __kmalloc(size, flags)
595 #endif /* DEBUG_SLAB */ 595 #endif /* DEBUG_SLAB */
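A sketch of the intended use, with a made-up helper modeled on kmemdup(): because _RET_IP_ is taken inside the helper, the allocation is attributed to the helper's caller, which is the call site a leak report actually needs to show.

static void *example_memdup(const void *src, size_t len, gfp_t flags)
{
	void *p = kmalloc_track_caller(len, flags);

	if (p)
		memcpy(p, src, len);
	return p;
}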
596 596
597 #ifdef CONFIG_NUMA 597 #ifdef CONFIG_NUMA
598 /* 598 /*
599 * kmalloc_node_track_caller is a special version of kmalloc_node that 599 * kmalloc_node_track_caller is a special version of kmalloc_node that
600 * records the calling function of the routine calling it for slab leak 600 * records the calling function of the routine calling it for slab leak
601 * tracking instead of just the calling function (confusing, eh?). 601 * tracking instead of just the calling function (confusing, eh?).
602 * It's useful when the call to kmalloc_node comes from a widely-used 602 * It's useful when the call to kmalloc_node comes from a widely-used
603 * standard allocator where we care about the real place the memory 603 * standard allocator where we care about the real place the memory
604 * allocation request comes from. 604 * allocation request comes from.
605 */ 605 */
606 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ 606 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
607 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ 607 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
608 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) 608 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
609 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); 609 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
610 #define kmalloc_node_track_caller(size, flags, node) \ 610 #define kmalloc_node_track_caller(size, flags, node) \
611 __kmalloc_node_track_caller(size, flags, node, \ 611 __kmalloc_node_track_caller(size, flags, node, \
612 _RET_IP_) 612 _RET_IP_)
613 #else 613 #else
614 #define kmalloc_node_track_caller(size, flags, node) \ 614 #define kmalloc_node_track_caller(size, flags, node) \
615 __kmalloc_node(size, flags, node) 615 __kmalloc_node(size, flags, node)
616 #endif 616 #endif
617 617
618 #else /* CONFIG_NUMA */ 618 #else /* CONFIG_NUMA */
619 619
620 #define kmalloc_node_track_caller(size, flags, node) \ 620 #define kmalloc_node_track_caller(size, flags, node) \
621 kmalloc_track_caller(size, flags) 621 kmalloc_track_caller(size, flags)
622 622
623 #endif /* CONFIG_NUMA */ 623 #endif /* CONFIG_NUMA */
624 624
625 /* 625 /*
626 * Shortcuts 626 * Shortcuts
627 */ 627 */
628 static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) 628 static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
629 { 629 {
630 return kmem_cache_alloc(k, flags | __GFP_ZERO); 630 return kmem_cache_alloc(k, flags | __GFP_ZERO);
631 } 631 }
632 632
633 /** 633 /**
634 * kzalloc - allocate memory. The memory is set to zero. 634 * kzalloc - allocate memory. The memory is set to zero.
635 * @size: how many bytes of memory are required. 635 * @size: how many bytes of memory are required.
636 * @flags: the type of memory to allocate (see kmalloc). 636 * @flags: the type of memory to allocate (see kmalloc).
637 */ 637 */
638 static inline void *kzalloc(size_t size, gfp_t flags) 638 static inline void *kzalloc(size_t size, gfp_t flags)
639 { 639 {
640 return kmalloc(size, flags | __GFP_ZERO); 640 return kmalloc(size, flags | __GFP_ZERO);
641 } 641 }
642 642
643 /** 643 /**
644 * kzalloc_node - allocate zeroed memory from a particular memory node. 644 * kzalloc_node - allocate zeroed memory from a particular memory node.
645 * @size: how many bytes of memory are required. 645 * @size: how many bytes of memory are required.
646 * @flags: the type of memory to allocate (see kmalloc). 646 * @flags: the type of memory to allocate (see kmalloc).
647 * @node: memory node from which to allocate 647 * @node: memory node from which to allocate
648 */ 648 */
649 static inline void *kzalloc_node(size_t size, gfp_t flags, int node) 649 static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
650 { 650 {
651 return kmalloc_node(size, flags | __GFP_ZERO, node); 651 return kmalloc_node(size, flags | __GFP_ZERO, node);
652 } 652 }
653 653
654 /* 654 /*
655 * Determine the size of a slab object 655 * Determine the size of a slab object
656 */ 656 */
657 static inline unsigned int kmem_cache_size(struct kmem_cache *s) 657 static inline unsigned int kmem_cache_size(struct kmem_cache *s)
658 { 658 {
659 return s->object_size; 659 return s->object_size;
660 } 660 }
661 661
662 void __init kmem_cache_init_late(void); 662 void __init kmem_cache_init_late(void);
663 663
664 #endif /* _LINUX_SLAB_H */ 664 #endif /* _LINUX_SLAB_H */
665 665
1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller 13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version. 20 * (at your option) any later version.
21 * 21 *
22 * This program is distributed in the hope that it will be useful, 22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28 #include <linux/res_counter.h> 28 #include <linux/res_counter.h>
29 #include <linux/memcontrol.h> 29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h> 30 #include <linux/cgroup.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/hugetlb.h> 32 #include <linux/hugetlb.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/smp.h> 34 #include <linux/smp.h>
35 #include <linux/page-flags.h> 35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h> 36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h> 37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h> 38 #include <linux/rcupdate.h>
39 #include <linux/limits.h> 39 #include <linux/limits.h>
40 #include <linux/export.h> 40 #include <linux/export.h>
41 #include <linux/mutex.h> 41 #include <linux/mutex.h>
42 #include <linux/rbtree.h> 42 #include <linux/rbtree.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/swap.h> 44 #include <linux/swap.h>
45 #include <linux/swapops.h> 45 #include <linux/swapops.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h> 47 #include <linux/eventfd.h>
48 #include <linux/poll.h> 48 #include <linux/poll.h>
49 #include <linux/sort.h> 49 #include <linux/sort.h>
50 #include <linux/fs.h> 50 #include <linux/fs.h>
51 #include <linux/seq_file.h> 51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h> 52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h> 53 #include <linux/mm_inline.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/cpu.h> 55 #include <linux/cpu.h>
56 #include <linux/oom.h> 56 #include <linux/oom.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/file.h> 58 #include <linux/file.h>
59 #include "internal.h" 59 #include "internal.h"
60 #include <net/sock.h> 60 #include <net/sock.h>
61 #include <net/ip.h> 61 #include <net/ip.h>
62 #include <net/tcp_memcontrol.h> 62 #include <net/tcp_memcontrol.h>
63 #include "slab.h" 63 #include "slab.h"
64 64
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 66
67 #include <trace/events/vmscan.h> 67 #include <trace/events/vmscan.h>
68 68
69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70 EXPORT_SYMBOL(memory_cgrp_subsys); 70 EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72 #define MEM_CGROUP_RECLAIM_RETRIES 5 72 #define MEM_CGROUP_RECLAIM_RETRIES 5
73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 73 static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75 #ifdef CONFIG_MEMCG_SWAP 75 #ifdef CONFIG_MEMCG_SWAP
76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77 int do_swap_account __read_mostly; 77 int do_swap_account __read_mostly;
78 78
79 /* to remember the boot option */ 79 /* to remember the boot option */
80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED
81 static int really_do_swap_account __initdata = 1; 81 static int really_do_swap_account __initdata = 1;
82 #else 82 #else
83 static int really_do_swap_account __initdata = 0; 83 static int really_do_swap_account __initdata = 0;
84 #endif 84 #endif
85 85
86 #else 86 #else
87 #define do_swap_account 0 87 #define do_swap_account 0
88 #endif 88 #endif
89 89
90 90
91 static const char * const mem_cgroup_stat_names[] = { 91 static const char * const mem_cgroup_stat_names[] = {
92 "cache", 92 "cache",
93 "rss", 93 "rss",
94 "rss_huge", 94 "rss_huge",
95 "mapped_file", 95 "mapped_file",
96 "writeback", 96 "writeback",
97 "swap", 97 "swap",
98 }; 98 };
99 99
100 enum mem_cgroup_events_index { 100 enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS, 105 MEM_CGROUP_EVENTS_NSTATS,
106 }; 106 };
107 107
108 static const char * const mem_cgroup_events_names[] = { 108 static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 109 "pgpgin",
110 "pgpgout", 110 "pgpgout",
111 "pgfault", 111 "pgfault",
112 "pgmajfault", 112 "pgmajfault",
113 }; 113 };
114 114
115 static const char * const mem_cgroup_lru_names[] = { 115 static const char * const mem_cgroup_lru_names[] = {
116 "inactive_anon", 116 "inactive_anon",
117 "active_anon", 117 "active_anon",
118 "inactive_file", 118 "inactive_file",
119 "active_file", 119 "active_file",
120 "unevictable", 120 "unevictable",
121 }; 121 };
122 122
123 /* 123 /*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 124 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremented by the number of pages. This counter is used to 125 * it will be incremented by the number of pages. This counter is used to
126 * trigger some periodic events. This is straightforward and better 126 * trigger some periodic events. This is straightforward and better
127 * than using jiffies etc. to handle periodic memcg events. 127 * than using jiffies etc. to handle periodic memcg events.
128 */ 128 */
129 enum mem_cgroup_events_target { 129 enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH, 130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT, 131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO, 132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS, 133 MEM_CGROUP_NTARGETS,
134 }; 134 };
135 #define THRESHOLDS_EVENTS_TARGET 128 135 #define THRESHOLDS_EVENTS_TARGET 128
136 #define SOFTLIMIT_EVENTS_TARGET 1024 136 #define SOFTLIMIT_EVENTS_TARGET 1024
137 #define NUMAINFO_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024
138 138
139 struct mem_cgroup_stat_cpu { 139 struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events; 142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144 }; 144 };
145 145
146 struct mem_cgroup_reclaim_iter { 146 struct mem_cgroup_reclaim_iter {
147 /* 147 /*
148 * last scanned hierarchy member. Valid only if last_dead_count 148 * last scanned hierarchy member. Valid only if last_dead_count
149 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
150 */ 150 */
151 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
152 int last_dead_count; 152 int last_dead_count;
153 153
154 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
155 unsigned int generation; 155 unsigned int generation;
156 }; 156 };
157 157
158 /* 158 /*
159 * per-zone information in memory controller. 159 * per-zone information in memory controller.
160 */ 160 */
161 struct mem_cgroup_per_zone { 161 struct mem_cgroup_per_zone {
162 struct lruvec lruvec; 162 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS]; 163 unsigned long lru_size[NR_LRU_LISTS];
164 164
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
166 166
167 struct rb_node tree_node; /* RB tree node */ 167 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 168 unsigned long long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 169 /* the soft limit is exceeded*/
170 bool on_tree; 170 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 171 struct mem_cgroup *memcg; /* Back pointer, we cannot */
172 /* use container_of */ 172 /* use container_of */
173 }; 173 };
174 174
175 struct mem_cgroup_per_node { 175 struct mem_cgroup_per_node {
176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
177 }; 177 };
178 178
179 /* 179 /*
180 * Cgroups above their limits are maintained in a RB-Tree, independent of 180 * Cgroups above their limits are maintained in a RB-Tree, independent of
181 * their hierarchy representation 181 * their hierarchy representation
182 */ 182 */
183 183
184 struct mem_cgroup_tree_per_zone { 184 struct mem_cgroup_tree_per_zone {
185 struct rb_root rb_root; 185 struct rb_root rb_root;
186 spinlock_t lock; 186 spinlock_t lock;
187 }; 187 };
188 188
189 struct mem_cgroup_tree_per_node { 189 struct mem_cgroup_tree_per_node {
190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
191 }; 191 };
192 192
193 struct mem_cgroup_tree { 193 struct mem_cgroup_tree {
194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
195 }; 195 };
196 196
197 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 197 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 198
199 struct mem_cgroup_threshold { 199 struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 200 struct eventfd_ctx *eventfd;
201 u64 threshold; 201 u64 threshold;
202 }; 202 };
203 203
204 /* For threshold */ 204 /* For threshold */
205 struct mem_cgroup_threshold_ary { 205 struct mem_cgroup_threshold_ary {
206 /* An array index points to threshold just below or equal to usage. */ 206 /* An array index points to threshold just below or equal to usage. */
207 int current_threshold; 207 int current_threshold;
208 /* Size of entries[] */ 208 /* Size of entries[] */
209 unsigned int size; 209 unsigned int size;
210 /* Array of thresholds */ 210 /* Array of thresholds */
211 struct mem_cgroup_threshold entries[0]; 211 struct mem_cgroup_threshold entries[0];
212 }; 212 };
213 213
214 struct mem_cgroup_thresholds { 214 struct mem_cgroup_thresholds {
215 /* Primary thresholds array */ 215 /* Primary thresholds array */
216 struct mem_cgroup_threshold_ary *primary; 216 struct mem_cgroup_threshold_ary *primary;
217 /* 217 /*
218 * Spare threshold array. 218 * Spare threshold array.
219 * This is needed to make mem_cgroup_unregister_event() "never fail". 219 * This is needed to make mem_cgroup_unregister_event() "never fail".
220 * It must be able to store at least primary->size - 1 entries. 220 * It must be able to store at least primary->size - 1 entries.
221 */ 221 */
222 struct mem_cgroup_threshold_ary *spare; 222 struct mem_cgroup_threshold_ary *spare;
223 }; 223 };
224 224
225 /* for OOM */ 225 /* for OOM */
226 struct mem_cgroup_eventfd_list { 226 struct mem_cgroup_eventfd_list {
227 struct list_head list; 227 struct list_head list;
228 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
229 }; 229 };
230 230
231 /* 231 /*
232 * cgroup_event represents events which userspace wants to receive. 232 * cgroup_event represents events which userspace wants to receive.
233 */ 233 */
234 struct mem_cgroup_event { 234 struct mem_cgroup_event {
235 /* 235 /*
236 * memcg which the event belongs to. 236 * memcg which the event belongs to.
237 */ 237 */
238 struct mem_cgroup *memcg; 238 struct mem_cgroup *memcg;
239 /* 239 /*
240 * eventfd to signal userspace about the event. 240 * eventfd to signal userspace about the event.
241 */ 241 */
242 struct eventfd_ctx *eventfd; 242 struct eventfd_ctx *eventfd;
243 /* 243 /*
244 * Each of these is stored in a list by the cgroup. 244 * Each of these is stored in a list by the cgroup.
245 */ 245 */
246 struct list_head list; 246 struct list_head list;
247 /* 247 /*
248 * register_event() callback will be used to add new userspace 248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal() 249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace. 250 * on eventfd to send notification to userspace.
251 */ 251 */
252 int (*register_event)(struct mem_cgroup *memcg, 252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args); 253 struct eventfd_ctx *eventfd, const char *args);
254 /* 254 /*
255 * unregister_event() callback will be called when userspace closes 255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or on cgroup removal. This callback must be set 256 * the eventfd or on cgroup removal. This callback must be set
257 * if you want to provide notification functionality. 257 * if you want to provide notification functionality.
258 */ 258 */
259 void (*unregister_event)(struct mem_cgroup *memcg, 259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd); 260 struct eventfd_ctx *eventfd);
261 /* 261 /*
262 * All fields below are needed to unregister the event when 262 * All fields below are needed to unregister the event when
263 * userspace closes the eventfd. 263 * userspace closes the eventfd.
264 */ 264 */
265 poll_table pt; 265 poll_table pt;
266 wait_queue_head_t *wqh; 266 wait_queue_head_t *wqh;
267 wait_queue_t wait; 267 wait_queue_t wait;
268 struct work_struct remove; 268 struct work_struct remove;
269 }; 269 };
270 270
271 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273 273
274 /* 274 /*
275 * The memory controller data structure. The memory controller controls both 275 * The memory controller data structure. The memory controller controls both
276 * page cache and RSS per cgroup. We would eventually like to provide 276 * page cache and RSS per cgroup. We would eventually like to provide
277 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 277 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
278 * to help the administrator determine what knobs to tune. 278 * to help the administrator determine what knobs to tune.
279 * 279 *
280 * TODO: Add a water mark for the memory controller. Reclaim will begin when 280 * TODO: Add a water mark for the memory controller. Reclaim will begin when
281 * we hit the water mark. Maybe even add a low water mark, such that 281 * we hit the water mark. Maybe even add a low water mark, such that
282 * no reclaim occurs from a cgroup at its low water mark; this is 282 * no reclaim occurs from a cgroup at its low water mark; this is
283 * a feature that will be implemented much later in the future. 283 * a feature that will be implemented much later in the future.
284 */ 284 */
285 struct mem_cgroup { 285 struct mem_cgroup {
286 struct cgroup_subsys_state css; 286 struct cgroup_subsys_state css;
287 /* 287 /*
288 * the counter to account for memory usage 288 * the counter to account for memory usage
289 */ 289 */
290 struct res_counter res; 290 struct res_counter res;
291 291
292 /* vmpressure notifications */ 292 /* vmpressure notifications */
293 struct vmpressure vmpressure; 293 struct vmpressure vmpressure;
294 294
295 /* 295 /*
296 * the counter to account for mem+swap usage. 296 * the counter to account for mem+swap usage.
297 */ 297 */
298 struct res_counter memsw; 298 struct res_counter memsw;
299 299
300 /* 300 /*
301 * the counter to account for kernel memory usage. 301 * the counter to account for kernel memory usage.
302 */ 302 */
303 struct res_counter kmem; 303 struct res_counter kmem;
304 /* 304 /*
305 * Should the accounting and control be hierarchical, per subtree? 305 * Should the accounting and control be hierarchical, per subtree?
306 */ 306 */
307 bool use_hierarchy; 307 bool use_hierarchy;
308 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 308 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
309 309
310 bool oom_lock; 310 bool oom_lock;
311 atomic_t under_oom; 311 atomic_t under_oom;
312 atomic_t oom_wakeups; 312 atomic_t oom_wakeups;
313 313
314 int swappiness; 314 int swappiness;
315 /* OOM-Killer disable */ 315 /* OOM-Killer disable */
316 int oom_kill_disable; 316 int oom_kill_disable;
317 317
318 /* set when res.limit == memsw.limit */ 318 /* set when res.limit == memsw.limit */
319 bool memsw_is_minimum; 319 bool memsw_is_minimum;
320 320
321 /* protect arrays of thresholds */ 321 /* protect arrays of thresholds */
322 struct mutex thresholds_lock; 322 struct mutex thresholds_lock;
323 323
324 /* thresholds for memory usage. RCU-protected */ 324 /* thresholds for memory usage. RCU-protected */
325 struct mem_cgroup_thresholds thresholds; 325 struct mem_cgroup_thresholds thresholds;
326 326
327 /* thresholds for mem+swap usage. RCU-protected */ 327 /* thresholds for mem+swap usage. RCU-protected */
328 struct mem_cgroup_thresholds memsw_thresholds; 328 struct mem_cgroup_thresholds memsw_thresholds;
329 329
330 /* For oom notifier event fd */ 330 /* For oom notifier event fd */
331 struct list_head oom_notify; 331 struct list_head oom_notify;
332 332
333 /* 333 /*
334 * Should we move charges of a task when a task is moved into this 334 * Should we move charges of a task when a task is moved into this
335 * mem_cgroup ? And what type of charges should we move ? 335 * mem_cgroup ? And what type of charges should we move ?
336 */ 336 */
337 unsigned long move_charge_at_immigrate; 337 unsigned long move_charge_at_immigrate;
338 /* 338 /*
339 * set > 0 if pages under this cgroup are moving to other cgroup. 339 * set > 0 if pages under this cgroup are moving to other cgroup.
340 */ 340 */
341 atomic_t moving_account; 341 atomic_t moving_account;
342 /* taken only while moving_account > 0 */ 342 /* taken only while moving_account > 0 */
343 spinlock_t move_lock; 343 spinlock_t move_lock;
344 /* 344 /*
345 * percpu counter. 345 * percpu counter.
346 */ 346 */
347 struct mem_cgroup_stat_cpu __percpu *stat; 347 struct mem_cgroup_stat_cpu __percpu *stat;
348 /* 348 /*
349 * used when a cpu is offlined or other synchronizations 349 * used when a cpu is offlined or other synchronizations
350 * See mem_cgroup_read_stat(). 350 * See mem_cgroup_read_stat().
351 */ 351 */
352 struct mem_cgroup_stat_cpu nocpu_base; 352 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock; 353 spinlock_t pcp_counter_lock;
354 354
355 atomic_t dead_count; 355 atomic_t dead_count;
356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem; 357 struct cg_proto tcp_mem;
358 #endif 358 #endif
359 #if defined(CONFIG_MEMCG_KMEM) 359 #if defined(CONFIG_MEMCG_KMEM)
360 /* analogous to slab_common's slab_caches list. per-memcg */ 360 /* analogous to slab_common's slab_caches list. per-memcg */
361 struct list_head memcg_slab_caches; 361 struct list_head memcg_slab_caches;
362 /* Not a spinlock, we can take a lot of time walking the list */ 362 /* Not a spinlock, we can take a lot of time walking the list */
363 struct mutex slab_caches_mutex; 363 struct mutex slab_caches_mutex;
364 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 364 /* Index in the kmem_cache->memcg_params->memcg_caches array */
365 int kmemcg_id; 365 int kmemcg_id;
366 #endif 366 #endif
367 367
368 int last_scanned_node; 368 int last_scanned_node;
369 #if MAX_NUMNODES > 1 369 #if MAX_NUMNODES > 1
370 nodemask_t scan_nodes; 370 nodemask_t scan_nodes;
371 atomic_t numainfo_events; 371 atomic_t numainfo_events;
372 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
373 #endif 373 #endif
374 374
375 /* List of events which userspace want to receive */ 375 /* List of events which userspace want to receive */
376 struct list_head event_list; 376 struct list_head event_list;
377 spinlock_t event_list_lock; 377 spinlock_t event_list_lock;
378 378
379 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
380 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
381 }; 381 };
382 382
383 /* internal only representation about the status of kmem accounting. */ 383 /* internal only representation about the status of kmem accounting. */
384 enum { 384 enum {
385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
387 }; 387 };
388 388
389 #ifdef CONFIG_MEMCG_KMEM 389 #ifdef CONFIG_MEMCG_KMEM
390 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
391 { 391 {
392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
393 } 393 }
394 394
395 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 395 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396 { 396 {
397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
398 } 398 }
399 399
400 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
401 { 401 {
402 /* 402 /*
403 * Our caller must use css_get() first, because memcg_uncharge_kmem() 403 * Our caller must use css_get() first, because memcg_uncharge_kmem()
404 * will call css_put() if it sees the memcg is dead. 404 * will call css_put() if it sees the memcg is dead.
405 */ 405 */
406 smp_wmb(); 406 smp_wmb();
407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
409 } 409 }
410 410
411 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 411 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
412 { 412 {
413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
414 &memcg->kmem_account_flags); 414 &memcg->kmem_account_flags);
415 } 415 }
416 #endif 416 #endif
417 417
418 /* Stuff for moving charges at task migration. */ 418 /* Stuff for moving charges at task migration. */
419 /* 419 /*
420 * Types of charges to be moved. "move_charge_at_immigrate" and 420 * Types of charges to be moved. "move_charge_at_immigrate" and
421 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 421 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
422 */ 422 */
423 enum move_type { 423 enum move_type {
424 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 424 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
425 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 425 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
426 NR_MOVE_TYPE, 426 NR_MOVE_TYPE,
427 }; 427 };
428 428
429 /* "mc" and its members are protected by cgroup_mutex */ 429 /* "mc" and its members are protected by cgroup_mutex */
430 static struct move_charge_struct { 430 static struct move_charge_struct {
431 spinlock_t lock; /* for from, to */ 431 spinlock_t lock; /* for from, to */
432 struct mem_cgroup *from; 432 struct mem_cgroup *from;
433 struct mem_cgroup *to; 433 struct mem_cgroup *to;
434 unsigned long immigrate_flags; 434 unsigned long immigrate_flags;
435 unsigned long precharge; 435 unsigned long precharge;
436 unsigned long moved_charge; 436 unsigned long moved_charge;
437 unsigned long moved_swap; 437 unsigned long moved_swap;
438 struct task_struct *moving_task; /* a task moving charges */ 438 struct task_struct *moving_task; /* a task moving charges */
439 wait_queue_head_t waitq; /* a waitq for other context */ 439 wait_queue_head_t waitq; /* a waitq for other context */
440 } mc = { 440 } mc = {
441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
443 }; 443 };
444 444
445 static bool move_anon(void) 445 static bool move_anon(void)
446 { 446 {
447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
448 } 448 }
449 449
450 static bool move_file(void) 450 static bool move_file(void)
451 { 451 {
452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
453 } 453 }
454 454
455 /* 455 /*
456 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 456 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
457 * limit reclaim to prevent infinite loops, if they ever occur. 457 * limit reclaim to prevent infinite loops, if they ever occur.
458 */ 458 */
459 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 459 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
460 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 460 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
461 461
462 enum charge_type { 462 enum charge_type {
463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
464 MEM_CGROUP_CHARGE_TYPE_ANON, 464 MEM_CGROUP_CHARGE_TYPE_ANON,
465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
466 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 466 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
467 NR_CHARGE_TYPE, 467 NR_CHARGE_TYPE,
468 }; 468 };
469 469
470 /* for encoding cft->private value on file */ 470 /* for encoding cft->private value on file */
471 enum res_type { 471 enum res_type {
472 _MEM, 472 _MEM,
473 _MEMSWAP, 473 _MEMSWAP,
474 _OOM_TYPE, 474 _OOM_TYPE,
475 _KMEM, 475 _KMEM,
476 }; 476 };
477 477
478 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 478 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
479 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 479 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
480 #define MEMFILE_ATTR(val) ((val) & 0xffff) 480 #define MEMFILE_ATTR(val) ((val) & 0xffff)
481 /* Used for OOM notifier */ 481 /* Used for OOM notifier */
482 #define OOM_CONTROL (0) 482 #define OOM_CONTROL (0)
483 483
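The MEMFILE_* macros above pack a resource type (_MEM, _MEMSWAP, _OOM_TYPE, _KMEM) and a per-file attribute into the single integer stored in cft->private: the type lives in the upper 16 bits, the attribute in the lower 16. A minimal standalone sketch of the same packing follows; the DEMO_* names and the attribute value 7 are illustrative, not identifiers from this patch.

#include <assert.h>
#include <stdio.h>

/* Same bit layout as MEMFILE_PRIVATE/MEMFILE_TYPE/MEMFILE_ATTR above. */
#define DEMO_PRIVATE(x, val)	((x) << 16 | (val))
#define DEMO_TYPE(val)		((val) >> 16 & 0xffff)
#define DEMO_ATTR(val)		((val) & 0xffff)

enum demo_res_type { DEMO_MEM, DEMO_MEMSWAP, DEMO_OOM_TYPE, DEMO_KMEM };

int main(void)
{
	int priv = DEMO_PRIVATE(DEMO_KMEM, 7);	/* 7 is an arbitrary attribute */

	assert(DEMO_TYPE(priv) == DEMO_KMEM);	/* upper 16 bits: the type */
	assert(DEMO_ATTR(priv) == 7);		/* lower 16 bits: the attribute */
	printf("type=%d attr=%d\n", DEMO_TYPE(priv), DEMO_ATTR(priv));
	return 0;
}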
484 /* 484 /*
485 * Reclaim flags for mem_cgroup_hierarchical_reclaim 485 * Reclaim flags for mem_cgroup_hierarchical_reclaim
486 */ 486 */
487 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 487 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
488 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 488 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
489 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 489 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
490 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 490 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
491 491
492 /* 492 /*
493 * The memcg_create_mutex will be held whenever a new cgroup is created. 493 * The memcg_create_mutex will be held whenever a new cgroup is created.
494 * As a consequence, any change that needs to protect against new child cgroups 494 * As a consequence, any change that needs to protect against new child cgroups
495 * appearing has to hold it as well. 495 * appearing has to hold it as well.
496 */ 496 */
497 static DEFINE_MUTEX(memcg_create_mutex); 497 static DEFINE_MUTEX(memcg_create_mutex);
498 498
499 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 499 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
500 { 500 {
501 return s ? container_of(s, struct mem_cgroup, css) : NULL; 501 return s ? container_of(s, struct mem_cgroup, css) : NULL;
502 } 502 }
503 503
504 /* Some nice accessors for the vmpressure. */ 504 /* Some nice accessors for the vmpressure. */
505 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 505 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
506 { 506 {
507 if (!memcg) 507 if (!memcg)
508 memcg = root_mem_cgroup; 508 memcg = root_mem_cgroup;
509 return &memcg->vmpressure; 509 return &memcg->vmpressure;
510 } 510 }
511 511
512 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 512 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
513 { 513 {
514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
515 } 515 }
516 516
517 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
518 { 518 {
519 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
520 } 520 }
521 521
522 /* 522 /*
523 * We restrict the id in the range of [1, 65535], so it can fit into 523 * We restrict the id in the range of [1, 65535], so it can fit into
524 * an unsigned short. 524 * an unsigned short.
525 */ 525 */
526 #define MEM_CGROUP_ID_MAX USHRT_MAX 526 #define MEM_CGROUP_ID_MAX USHRT_MAX
527 527
528 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 528 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
529 { 529 {
530 /* 530 /*
531 * The ID of the root cgroup is 0, but memcg treats 0 as an 531 * The ID of the root cgroup is 0, but memcg treats 0 as an
532 * invalid ID, so we return (cgroup_id + 1). 532 * invalid ID, so we return (cgroup_id + 1).
533 */ 533 */
534 return memcg->css.cgroup->id + 1; 534 return memcg->css.cgroup->id + 1;
535 } 535 }
536 536
537 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 537 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
538 { 538 {
539 struct cgroup_subsys_state *css; 539 struct cgroup_subsys_state *css;
540 540
541 css = css_from_id(id - 1, &memory_cgrp_subsys); 541 css = css_from_id(id - 1, &memory_cgrp_subsys);
542 return mem_cgroup_from_css(css); 542 return mem_cgroup_from_css(css);
543 } 543 }
544 544
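Because id 0 is reserved as "invalid", the pair of helpers above shifts the cgroup id by one in each direction so the result always fits a non-zero unsigned short (the [1, USHRT_MAX] range noted above). A standalone sketch of that round-trip, with a plain integer standing in for the css/cgroup structures (the demo_* helpers are made up for illustration):

#include <assert.h>
#include <limits.h>

/* Illustrative stand-ins: a "cgroup id" is just an int here. */
static unsigned short demo_mem_cgroup_id(int cgroup_id)
{
	return cgroup_id + 1;		/* root's 0 becomes 1, so 0 stays "invalid" */
}

static int demo_cgroup_id_from_memcg_id(unsigned short id)
{
	return id - 1;			/* inverse of the +1 above */
}

int main(void)
{
	assert(demo_mem_cgroup_id(0) == 1);		/* root cgroup */
	assert(demo_cgroup_id_from_memcg_id(1) == 0);	/* round-trips back to 0 */
	assert(demo_mem_cgroup_id(USHRT_MAX - 1) == USHRT_MAX);
	return 0;
}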
545 /* Writing them here to avoid exposing memcg's inner layout */ 545 /* Writing them here to avoid exposing memcg's inner layout */
546 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 546 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
547 547
548 void sock_update_memcg(struct sock *sk) 548 void sock_update_memcg(struct sock *sk)
549 { 549 {
550 if (mem_cgroup_sockets_enabled) { 550 if (mem_cgroup_sockets_enabled) {
551 struct mem_cgroup *memcg; 551 struct mem_cgroup *memcg;
552 struct cg_proto *cg_proto; 552 struct cg_proto *cg_proto;
553 553
554 BUG_ON(!sk->sk_prot->proto_cgroup); 554 BUG_ON(!sk->sk_prot->proto_cgroup);
555 555
556 /* Socket cloning can throw us here with sk_cgrp already 556 /* Socket cloning can throw us here with sk_cgrp already
557 * filled. It won't, however, necessarily happen from 557 * filled. It won't, however, necessarily happen from
558 * process context. So the test for root memcg given 558 * process context. So the test for root memcg given
559 * the current task's memcg won't help us in this case. 559 * the current task's memcg won't help us in this case.
560 * 560 *
561 * Respecting the original socket's memcg is a better 561 * Respecting the original socket's memcg is a better
562 * decision in this case. 562 * decision in this case.
563 */ 563 */
564 if (sk->sk_cgrp) { 564 if (sk->sk_cgrp) {
565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
566 css_get(&sk->sk_cgrp->memcg->css); 566 css_get(&sk->sk_cgrp->memcg->css);
567 return; 567 return;
568 } 568 }
569 569
570 rcu_read_lock(); 570 rcu_read_lock();
571 memcg = mem_cgroup_from_task(current); 571 memcg = mem_cgroup_from_task(current);
572 cg_proto = sk->sk_prot->proto_cgroup(memcg); 572 cg_proto = sk->sk_prot->proto_cgroup(memcg);
573 if (!mem_cgroup_is_root(memcg) && 573 if (!mem_cgroup_is_root(memcg) &&
574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { 574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
575 sk->sk_cgrp = cg_proto; 575 sk->sk_cgrp = cg_proto;
576 } 576 }
577 rcu_read_unlock(); 577 rcu_read_unlock();
578 } 578 }
579 } 579 }
580 EXPORT_SYMBOL(sock_update_memcg); 580 EXPORT_SYMBOL(sock_update_memcg);
581 581
582 void sock_release_memcg(struct sock *sk) 582 void sock_release_memcg(struct sock *sk)
583 { 583 {
584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
585 struct mem_cgroup *memcg; 585 struct mem_cgroup *memcg;
586 WARN_ON(!sk->sk_cgrp->memcg); 586 WARN_ON(!sk->sk_cgrp->memcg);
587 memcg = sk->sk_cgrp->memcg; 587 memcg = sk->sk_cgrp->memcg;
588 css_put(&sk->sk_cgrp->memcg->css); 588 css_put(&sk->sk_cgrp->memcg->css);
589 } 589 }
590 } 590 }
591 591
592 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 592 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
593 { 593 {
594 if (!memcg || mem_cgroup_is_root(memcg)) 594 if (!memcg || mem_cgroup_is_root(memcg))
595 return NULL; 595 return NULL;
596 596
597 return &memcg->tcp_mem; 597 return &memcg->tcp_mem;
598 } 598 }
599 EXPORT_SYMBOL(tcp_proto_cgroup); 599 EXPORT_SYMBOL(tcp_proto_cgroup);
600 600
601 static void disarm_sock_keys(struct mem_cgroup *memcg) 601 static void disarm_sock_keys(struct mem_cgroup *memcg)
602 { 602 {
603 if (!memcg_proto_activated(&memcg->tcp_mem)) 603 if (!memcg_proto_activated(&memcg->tcp_mem))
604 return; 604 return;
605 static_key_slow_dec(&memcg_socket_limit_enabled); 605 static_key_slow_dec(&memcg_socket_limit_enabled);
606 } 606 }
607 #else 607 #else
608 static void disarm_sock_keys(struct mem_cgroup *memcg) 608 static void disarm_sock_keys(struct mem_cgroup *memcg)
609 { 609 {
610 } 610 }
611 #endif 611 #endif
612 612
613 #ifdef CONFIG_MEMCG_KMEM 613 #ifdef CONFIG_MEMCG_KMEM
614 /* 614 /*
615 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 615 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
616 * The main reason for not using cgroup id for this: 616 * The main reason for not using cgroup id for this:
617 * this works better in sparse environments, where we have a lot of memcgs, 617 * this works better in sparse environments, where we have a lot of memcgs,
618 * but only a few kmem-limited. For instance, if we have 200 618 * but only a few kmem-limited. For instance, if we have 200
619 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 619 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
620 * 200-entry array for that. 620 * 200-entry array for that.
621 * 621 *
622 * The current size of the caches array is stored in 622 * The current size of the caches array is stored in
623 * memcg_limited_groups_array_size. It will double each time we have to 623 * memcg_limited_groups_array_size. It will double each time we have to
624 * increase it. 624 * increase it.
625 */ 625 */
626 static DEFINE_IDA(kmem_limited_groups); 626 static DEFINE_IDA(kmem_limited_groups);
627 int memcg_limited_groups_array_size; 627 int memcg_limited_groups_array_size;
628 628
629 /* 629 /*
630 * MIN_SIZE is different than 1, because we would like to avoid going through 630 * MIN_SIZE is different than 1, because we would like to avoid going through
631 * the alloc/free process all the time. In a small machine, 4 kmem-limited 631 * the alloc/free process all the time. In a small machine, 4 kmem-limited
632 * cgroups is a reasonable guess. In the future, it could be a parameter or 632 * cgroups is a reasonable guess. In the future, it could be a parameter or
633 * tunable, but that is strictly not necessary. 633 * tunable, but that is strictly not necessary.
634 * 634 *
635 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 635 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
636 * this constant directly from cgroup, but it is understandable that this is 636 * this constant directly from cgroup, but it is understandable that this is
637 * better kept as an internal representation in cgroup.c. In any case, the 637 * better kept as an internal representation in cgroup.c. In any case, the
638 * cgrp_id space is not getting any smaller, and we don't have to necessarily 638 * cgrp_id space is not getting any smaller, and we don't have to necessarily
639 * increase ours as well if it increases. 639 * increase ours as well if it increases.
640 */ 640 */
641 #define MEMCG_CACHES_MIN_SIZE 4 641 #define MEMCG_CACHES_MIN_SIZE 4
642 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 642 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
643 643
644 /* 644 /*
645 * A lot of the calls to the cache allocation functions are expected to be 645 * A lot of the calls to the cache allocation functions are expected to be
646 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 646 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
647 * conditional to this static branch, we'll have to allow modules that do 647 * conditional to this static branch, we'll have to allow modules that do
648 * kmem_cache_alloc and the like to see this symbol as well. 648 * kmem_cache_alloc and the like to see this symbol as well.
649 */ 649 */
650 struct static_key memcg_kmem_enabled_key; 650 struct static_key memcg_kmem_enabled_key;
651 EXPORT_SYMBOL(memcg_kmem_enabled_key); 651 EXPORT_SYMBOL(memcg_kmem_enabled_key);
652 652
653 static void disarm_kmem_keys(struct mem_cgroup *memcg) 653 static void disarm_kmem_keys(struct mem_cgroup *memcg)
654 { 654 {
655 if (memcg_kmem_is_active(memcg)) { 655 if (memcg_kmem_is_active(memcg)) {
656 static_key_slow_dec(&memcg_kmem_enabled_key); 656 static_key_slow_dec(&memcg_kmem_enabled_key);
657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
658 } 658 }
659 /* 659 /*
660 * This check can't live in kmem destruction function, 660 * This check can't live in kmem destruction function,
661 * since the charges will outlive the cgroup 661 * since the charges will outlive the cgroup
662 */ 662 */
663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
664 } 664 }
665 #else 665 #else
666 static void disarm_kmem_keys(struct mem_cgroup *memcg) 666 static void disarm_kmem_keys(struct mem_cgroup *memcg)
667 { 667 {
668 } 668 }
669 #endif /* CONFIG_MEMCG_KMEM */ 669 #endif /* CONFIG_MEMCG_KMEM */
670 670
671 static void disarm_static_keys(struct mem_cgroup *memcg) 671 static void disarm_static_keys(struct mem_cgroup *memcg)
672 { 672 {
673 disarm_sock_keys(memcg); 673 disarm_sock_keys(memcg);
674 disarm_kmem_keys(memcg); 674 disarm_kmem_keys(memcg);
675 } 675 }
676 676
677 static void drain_all_stock_async(struct mem_cgroup *memcg); 677 static void drain_all_stock_async(struct mem_cgroup *memcg);
678 678
679 static struct mem_cgroup_per_zone * 679 static struct mem_cgroup_per_zone *
680 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 680 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
681 { 681 {
682 VM_BUG_ON((unsigned)nid >= nr_node_ids); 682 VM_BUG_ON((unsigned)nid >= nr_node_ids);
683 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 683 return &memcg->nodeinfo[nid]->zoneinfo[zid];
684 } 684 }
685 685
686 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 686 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
687 { 687 {
688 return &memcg->css; 688 return &memcg->css;
689 } 689 }
690 690
691 static struct mem_cgroup_per_zone * 691 static struct mem_cgroup_per_zone *
692 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 692 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
693 { 693 {
694 int nid = page_to_nid(page); 694 int nid = page_to_nid(page);
695 int zid = page_zonenum(page); 695 int zid = page_zonenum(page);
696 696
697 return mem_cgroup_zoneinfo(memcg, nid, zid); 697 return mem_cgroup_zoneinfo(memcg, nid, zid);
698 } 698 }
699 699
700 static struct mem_cgroup_tree_per_zone * 700 static struct mem_cgroup_tree_per_zone *
701 soft_limit_tree_node_zone(int nid, int zid) 701 soft_limit_tree_node_zone(int nid, int zid)
702 { 702 {
703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
704 } 704 }
705 705
706 static struct mem_cgroup_tree_per_zone * 706 static struct mem_cgroup_tree_per_zone *
707 soft_limit_tree_from_page(struct page *page) 707 soft_limit_tree_from_page(struct page *page)
708 { 708 {
709 int nid = page_to_nid(page); 709 int nid = page_to_nid(page);
710 int zid = page_zonenum(page); 710 int zid = page_zonenum(page);
711 711
712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
713 } 713 }
714 714
715 static void 715 static void
716 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 716 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
717 struct mem_cgroup_per_zone *mz, 717 struct mem_cgroup_per_zone *mz,
718 struct mem_cgroup_tree_per_zone *mctz, 718 struct mem_cgroup_tree_per_zone *mctz,
719 unsigned long long new_usage_in_excess) 719 unsigned long long new_usage_in_excess)
720 { 720 {
721 struct rb_node **p = &mctz->rb_root.rb_node; 721 struct rb_node **p = &mctz->rb_root.rb_node;
722 struct rb_node *parent = NULL; 722 struct rb_node *parent = NULL;
723 struct mem_cgroup_per_zone *mz_node; 723 struct mem_cgroup_per_zone *mz_node;
724 724
725 if (mz->on_tree) 725 if (mz->on_tree)
726 return; 726 return;
727 727
728 mz->usage_in_excess = new_usage_in_excess; 728 mz->usage_in_excess = new_usage_in_excess;
729 if (!mz->usage_in_excess) 729 if (!mz->usage_in_excess)
730 return; 730 return;
731 while (*p) { 731 while (*p) {
732 parent = *p; 732 parent = *p;
733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
734 tree_node); 734 tree_node);
735 if (mz->usage_in_excess < mz_node->usage_in_excess) 735 if (mz->usage_in_excess < mz_node->usage_in_excess)
736 p = &(*p)->rb_left; 736 p = &(*p)->rb_left;
737 /* 737 /*
738 * We can't avoid mem cgroups that are over their soft 738 * We can't avoid mem cgroups that are over their soft
739 * limit by the same amount 739 * limit by the same amount
740 */ 740 */
741 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 741 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
742 p = &(*p)->rb_right; 742 p = &(*p)->rb_right;
743 } 743 }
744 rb_link_node(&mz->tree_node, parent, p); 744 rb_link_node(&mz->tree_node, parent, p);
745 rb_insert_color(&mz->tree_node, &mctz->rb_root); 745 rb_insert_color(&mz->tree_node, &mctz->rb_root);
746 mz->on_tree = true; 746 mz->on_tree = true;
747 } 747 }
748 748
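The insertion above keys the per-zone rb-tree on usage_in_excess, sending equal keys to the right so that several memcgs over their soft limit by the same amount can all stay on the tree; reclaim then picks the rightmost (largest-excess) node. A small standalone sketch of the same ordering rule, using a sorted array instead of an rb-tree (all demo_* names and the sample values are illustrative):

#include <assert.h>
#include <stddef.h>

/*
 * Toy model: keep excess values in ascending order; ties go after existing
 * equal entries, mirroring the "< goes left, >= goes right" rule above.
 */
static void demo_insert(unsigned long long *tree, size_t *nr,
			unsigned long long excess)
{
	size_t i = *nr;

	if (!excess)			/* zero excess is never inserted */
		return;
	while (i > 0 && excess < tree[i - 1]) {
		tree[i] = tree[i - 1];	/* shift strictly larger entries right */
		i--;
	}
	tree[i] = excess;
	(*nr)++;
}

int main(void)
{
	unsigned long long tree[8];
	size_t nr = 0;

	demo_insert(tree, &nr, 4096);
	demo_insert(tree, &nr, 8192);
	demo_insert(tree, &nr, 4096);	/* same excess as the first entry */
	demo_insert(tree, &nr, 0);	/* ignored, like usage_in_excess == 0 */

	assert(nr == 3);		/* both 4096 entries coexist */
	assert(tree[nr - 1] == 8192);	/* "rightmost" == largest excess */
	return 0;
}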
749 static void 749 static void
750 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 750 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
751 struct mem_cgroup_per_zone *mz, 751 struct mem_cgroup_per_zone *mz,
752 struct mem_cgroup_tree_per_zone *mctz) 752 struct mem_cgroup_tree_per_zone *mctz)
753 { 753 {
754 if (!mz->on_tree) 754 if (!mz->on_tree)
755 return; 755 return;
756 rb_erase(&mz->tree_node, &mctz->rb_root); 756 rb_erase(&mz->tree_node, &mctz->rb_root);
757 mz->on_tree = false; 757 mz->on_tree = false;
758 } 758 }
759 759
760 static void 760 static void
761 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 761 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
762 struct mem_cgroup_per_zone *mz, 762 struct mem_cgroup_per_zone *mz,
763 struct mem_cgroup_tree_per_zone *mctz) 763 struct mem_cgroup_tree_per_zone *mctz)
764 { 764 {
765 spin_lock(&mctz->lock); 765 spin_lock(&mctz->lock);
766 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 766 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
767 spin_unlock(&mctz->lock); 767 spin_unlock(&mctz->lock);
768 } 768 }
769 769
770 770
771 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 771 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
772 { 772 {
773 unsigned long long excess; 773 unsigned long long excess;
774 struct mem_cgroup_per_zone *mz; 774 struct mem_cgroup_per_zone *mz;
775 struct mem_cgroup_tree_per_zone *mctz; 775 struct mem_cgroup_tree_per_zone *mctz;
776 int nid = page_to_nid(page); 776 int nid = page_to_nid(page);
777 int zid = page_zonenum(page); 777 int zid = page_zonenum(page);
778 mctz = soft_limit_tree_from_page(page); 778 mctz = soft_limit_tree_from_page(page);
779 779
780 /* 780 /*
781 * Necessary to update all ancestors when hierarchy is used, 781 * Necessary to update all ancestors when hierarchy is used,
782 * because their event counter is not touched. 782 * because their event counter is not touched.
783 */ 783 */
784 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 784 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
785 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 785 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
786 excess = res_counter_soft_limit_excess(&memcg->res); 786 excess = res_counter_soft_limit_excess(&memcg->res);
787 /* 787 /*
788 * We have to update the tree if mz is on RB-tree or 788 * We have to update the tree if mz is on RB-tree or
789 * mem is over its softlimit. 789 * mem is over its softlimit.
790 */ 790 */
791 if (excess || mz->on_tree) { 791 if (excess || mz->on_tree) {
792 spin_lock(&mctz->lock); 792 spin_lock(&mctz->lock);
793 /* if on-tree, remove it */ 793 /* if on-tree, remove it */
794 if (mz->on_tree) 794 if (mz->on_tree)
795 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 795 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
796 /* 796 /*
797 * Insert again. mz->usage_in_excess will be updated. 797 * Insert again. mz->usage_in_excess will be updated.
798 * If excess is 0, no tree ops. 798 * If excess is 0, no tree ops.
799 */ 799 */
800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
801 spin_unlock(&mctz->lock); 801 spin_unlock(&mctz->lock);
802 } 802 }
803 } 803 }
804 } 804 }
805 805
806 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 806 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
807 { 807 {
808 int node, zone; 808 int node, zone;
809 struct mem_cgroup_per_zone *mz; 809 struct mem_cgroup_per_zone *mz;
810 struct mem_cgroup_tree_per_zone *mctz; 810 struct mem_cgroup_tree_per_zone *mctz;
811 811
812 for_each_node(node) { 812 for_each_node(node) {
813 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 813 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
814 mz = mem_cgroup_zoneinfo(memcg, node, zone); 814 mz = mem_cgroup_zoneinfo(memcg, node, zone);
815 mctz = soft_limit_tree_node_zone(node, zone); 815 mctz = soft_limit_tree_node_zone(node, zone);
816 mem_cgroup_remove_exceeded(memcg, mz, mctz); 816 mem_cgroup_remove_exceeded(memcg, mz, mctz);
817 } 817 }
818 } 818 }
819 } 819 }
820 820
821 static struct mem_cgroup_per_zone * 821 static struct mem_cgroup_per_zone *
822 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 822 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
823 { 823 {
824 struct rb_node *rightmost = NULL; 824 struct rb_node *rightmost = NULL;
825 struct mem_cgroup_per_zone *mz; 825 struct mem_cgroup_per_zone *mz;
826 826
827 retry: 827 retry:
828 mz = NULL; 828 mz = NULL;
829 rightmost = rb_last(&mctz->rb_root); 829 rightmost = rb_last(&mctz->rb_root);
830 if (!rightmost) 830 if (!rightmost)
831 goto done; /* Nothing to reclaim from */ 831 goto done; /* Nothing to reclaim from */
832 832
833 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 833 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
834 /* 834 /*
835 * Remove the node now but someone else can add it back, 835 * Remove the node now but someone else can add it back,
836 * we will add it back at the end of reclaim to its correct 836 * we will add it back at the end of reclaim to its correct
837 * position in the tree. 837 * position in the tree.
838 */ 838 */
839 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 839 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
840 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 840 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
841 !css_tryget(&mz->memcg->css)) 841 !css_tryget(&mz->memcg->css))
842 goto retry; 842 goto retry;
843 done: 843 done:
844 return mz; 844 return mz;
845 } 845 }
846 846
847 static struct mem_cgroup_per_zone * 847 static struct mem_cgroup_per_zone *
848 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 848 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
849 { 849 {
850 struct mem_cgroup_per_zone *mz; 850 struct mem_cgroup_per_zone *mz;
851 851
852 spin_lock(&mctz->lock); 852 spin_lock(&mctz->lock);
853 mz = __mem_cgroup_largest_soft_limit_node(mctz); 853 mz = __mem_cgroup_largest_soft_limit_node(mctz);
854 spin_unlock(&mctz->lock); 854 spin_unlock(&mctz->lock);
855 return mz; 855 return mz;
856 } 856 }
857 857
858 /* 858 /*
859 * Implementation Note: reading percpu statistics for memcg. 859 * Implementation Note: reading percpu statistics for memcg.
860 * 860 *
861 * Both vmstat[] and percpu_counter have thresholds and do periodic 861 * Both vmstat[] and percpu_counter have thresholds and do periodic
862 * synchronization to implement a "quick" read. There is a trade-off between 862 * synchronization to implement a "quick" read. There is a trade-off between
863 * reading cost and precision of the value. We may eventually implement 863 * reading cost and precision of the value. We may eventually implement
864 * periodic synchronization of memcg's counters in the same way. 864 * periodic synchronization of memcg's counters in the same way.
865 * 865 *
866 * But this _read() function is used for the user interface now. The user 866 * But this _read() function is used for the user interface now. The user
867 * accounts memory usage by memory cgroup and _always_ requires an exact value 867 * accounts memory usage by memory cgroup and _always_ requires an exact value
868 * for that accounting. Even if we provided a quick-and-fuzzy read, we would 868 * for that accounting. Even if we provided a quick-and-fuzzy read, we would
869 * still have to visit all online cpus and compute the sum. So, for now, 869 * still have to visit all online cpus and compute the sum. So, for now,
870 * unnecessary synchronization is not implemented (only cpu hotplug is handled). 870 * unnecessary synchronization is not implemented (only cpu hotplug is handled).
871 * 871 *
872 * If there are kernel-internal users that can tolerate a not-exact value, 872 * If there are kernel-internal users that can tolerate a not-exact value,
873 * and reading all cpu values becomes a performance bottleneck in some common 873 * and reading all cpu values becomes a performance bottleneck in some common
874 * workload, a threshold and synchronization scheme like vmstat[] should be 874 * workload, a threshold and synchronization scheme like vmstat[] should be
875 * implemented. 875 * implemented.
876 */ 876 */
877 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 877 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
878 enum mem_cgroup_stat_index idx) 878 enum mem_cgroup_stat_index idx)
879 { 879 {
880 long val = 0; 880 long val = 0;
881 int cpu; 881 int cpu;
882 882
883 get_online_cpus(); 883 get_online_cpus();
884 for_each_online_cpu(cpu) 884 for_each_online_cpu(cpu)
885 val += per_cpu(memcg->stat->count[idx], cpu); 885 val += per_cpu(memcg->stat->count[idx], cpu);
886 #ifdef CONFIG_HOTPLUG_CPU 886 #ifdef CONFIG_HOTPLUG_CPU
887 spin_lock(&memcg->pcp_counter_lock); 887 spin_lock(&memcg->pcp_counter_lock);
888 val += memcg->nocpu_base.count[idx]; 888 val += memcg->nocpu_base.count[idx];
889 spin_unlock(&memcg->pcp_counter_lock); 889 spin_unlock(&memcg->pcp_counter_lock);
890 #endif 890 #endif
891 put_online_cpus(); 891 put_online_cpus();
892 return val; 892 return val;
893 } 893 }
894 894
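As the comment above explains, mem_cgroup_read_stat() takes the slow-but-exact path: it walks every online cpu and sums that cpu's slot of the counter, plus a "parked" base for cpus that went offline. A standalone sketch of the same idea, with a plain array standing in for the per-cpu storage (the demo_* names and sample numbers are made up):

#include <assert.h>

#define DEMO_NR_CPUS 4

/* Stand-ins for the per-cpu counters and the offline-cpu base value. */
static long demo_percpu_count[DEMO_NR_CPUS] = { 10, 20, 30, 40 };
static long demo_nocpu_base = 5;	/* counts parked when a cpu went offline */

static long demo_read_stat(void)
{
	long val = 0;
	int cpu;

	for (cpu = 0; cpu < DEMO_NR_CPUS; cpu++)	/* like for_each_online_cpu */
		val += demo_percpu_count[cpu];
	val += demo_nocpu_base;				/* the CONFIG_HOTPLUG_CPU leftover */
	return val;
}

int main(void)
{
	assert(demo_read_stat() == 105);	/* 10 + 20 + 30 + 40 + 5 */
	return 0;
}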
895 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 895 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
896 bool charge) 896 bool charge)
897 { 897 {
898 int val = (charge) ? 1 : -1; 898 int val = (charge) ? 1 : -1;
899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
900 } 900 }
901 901
902 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 902 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
903 enum mem_cgroup_events_index idx) 903 enum mem_cgroup_events_index idx)
904 { 904 {
905 unsigned long val = 0; 905 unsigned long val = 0;
906 int cpu; 906 int cpu;
907 907
908 get_online_cpus(); 908 get_online_cpus();
909 for_each_online_cpu(cpu) 909 for_each_online_cpu(cpu)
910 val += per_cpu(memcg->stat->events[idx], cpu); 910 val += per_cpu(memcg->stat->events[idx], cpu);
911 #ifdef CONFIG_HOTPLUG_CPU 911 #ifdef CONFIG_HOTPLUG_CPU
912 spin_lock(&memcg->pcp_counter_lock); 912 spin_lock(&memcg->pcp_counter_lock);
913 val += memcg->nocpu_base.events[idx]; 913 val += memcg->nocpu_base.events[idx];
914 spin_unlock(&memcg->pcp_counter_lock); 914 spin_unlock(&memcg->pcp_counter_lock);
915 #endif 915 #endif
916 put_online_cpus(); 916 put_online_cpus();
917 return val; 917 return val;
918 } 918 }
919 919
920 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 920 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
921 struct page *page, 921 struct page *page,
922 bool anon, int nr_pages) 922 bool anon, int nr_pages)
923 { 923 {
924 /* 924 /*
925 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 925 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
926 * counted as CACHE even if it's on ANON LRU. 926 * counted as CACHE even if it's on ANON LRU.
927 */ 927 */
928 if (anon) 928 if (anon)
929 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 929 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
930 nr_pages); 930 nr_pages);
931 else 931 else
932 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 932 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
933 nr_pages); 933 nr_pages);
934 934
935 if (PageTransHuge(page)) 935 if (PageTransHuge(page))
936 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 936 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
937 nr_pages); 937 nr_pages);
938 938
939 /* pagein of a big page is an event. So, ignore page size */ 939 /* pagein of a big page is an event. So, ignore page size */
940 if (nr_pages > 0) 940 if (nr_pages > 0)
941 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 941 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
942 else { 942 else {
943 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 943 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
944 nr_pages = -nr_pages; /* for event */ 944 nr_pages = -nr_pages; /* for event */
945 } 945 }
946 946
947 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 947 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
948 } 948 }
949 949
950 unsigned long 950 unsigned long
951 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 951 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
952 { 952 {
953 struct mem_cgroup_per_zone *mz; 953 struct mem_cgroup_per_zone *mz;
954 954
955 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 955 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
956 return mz->lru_size[lru]; 956 return mz->lru_size[lru];
957 } 957 }
958 958
959 static unsigned long 959 static unsigned long
960 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 960 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
961 unsigned int lru_mask) 961 unsigned int lru_mask)
962 { 962 {
963 struct mem_cgroup_per_zone *mz; 963 struct mem_cgroup_per_zone *mz;
964 enum lru_list lru; 964 enum lru_list lru;
965 unsigned long ret = 0; 965 unsigned long ret = 0;
966 966
967 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 967 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
968 968
969 for_each_lru(lru) { 969 for_each_lru(lru) {
970 if (BIT(lru) & lru_mask) 970 if (BIT(lru) & lru_mask)
971 ret += mz->lru_size[lru]; 971 ret += mz->lru_size[lru];
972 } 972 }
973 return ret; 973 return ret;
974 } 974 }
975 975
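mem_cgroup_zone_nr_lru_pages() treats lru_mask as a bitmap over the LRU lists and adds only the sizes whose bit is set; the node- and memcg-wide helpers below simply sum this over zones and nodes. A tiny standalone sketch of the mask test, with made-up list names and sizes (none of the DEMO_* identifiers exist in the kernel):

#include <assert.h>

#define DEMO_BIT(nr)	(1UL << (nr))

enum demo_lru_list { DEMO_INACTIVE_ANON, DEMO_ACTIVE_ANON,
		     DEMO_INACTIVE_FILE, DEMO_ACTIVE_FILE, DEMO_NR_LRU_LISTS };

static unsigned long demo_lru_size[DEMO_NR_LRU_LISTS] = { 100, 50, 200, 25 };

static unsigned long demo_nr_lru_pages(unsigned int lru_mask)
{
	unsigned long ret = 0;
	int lru;

	for (lru = 0; lru < DEMO_NR_LRU_LISTS; lru++)
		if (DEMO_BIT(lru) & lru_mask)	/* only lists selected by the mask */
			ret += demo_lru_size[lru];
	return ret;
}

int main(void)
{
	/* Count only the anon lists: 100 + 50. */
	assert(demo_nr_lru_pages(DEMO_BIT(DEMO_INACTIVE_ANON) |
				 DEMO_BIT(DEMO_ACTIVE_ANON)) == 150);
	return 0;
}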
976 static unsigned long 976 static unsigned long
977 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 977 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
978 int nid, unsigned int lru_mask) 978 int nid, unsigned int lru_mask)
979 { 979 {
980 u64 total = 0; 980 u64 total = 0;
981 int zid; 981 int zid;
982 982
983 for (zid = 0; zid < MAX_NR_ZONES; zid++) 983 for (zid = 0; zid < MAX_NR_ZONES; zid++)
984 total += mem_cgroup_zone_nr_lru_pages(memcg, 984 total += mem_cgroup_zone_nr_lru_pages(memcg,
985 nid, zid, lru_mask); 985 nid, zid, lru_mask);
986 986
987 return total; 987 return total;
988 } 988 }
989 989
990 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 990 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
991 unsigned int lru_mask) 991 unsigned int lru_mask)
992 { 992 {
993 int nid; 993 int nid;
994 u64 total = 0; 994 u64 total = 0;
995 995
996 for_each_node_state(nid, N_MEMORY) 996 for_each_node_state(nid, N_MEMORY)
997 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 997 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
998 return total; 998 return total;
999 } 999 }
1000 1000
1001 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 1001 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1002 enum mem_cgroup_events_target target) 1002 enum mem_cgroup_events_target target)
1003 { 1003 {
1004 unsigned long val, next; 1004 unsigned long val, next;
1005 1005
1006 val = __this_cpu_read(memcg->stat->nr_page_events); 1006 val = __this_cpu_read(memcg->stat->nr_page_events);
1007 next = __this_cpu_read(memcg->stat->targets[target]); 1007 next = __this_cpu_read(memcg->stat->targets[target]);
1008 /* from time_after() in jiffies.h */ 1008 /* from time_after() in jiffies.h */
1009 if ((long)next - (long)val < 0) { 1009 if ((long)next - (long)val < 0) {
1010 switch (target) { 1010 switch (target) {
1011 case MEM_CGROUP_TARGET_THRESH: 1011 case MEM_CGROUP_TARGET_THRESH:
1012 next = val + THRESHOLDS_EVENTS_TARGET; 1012 next = val + THRESHOLDS_EVENTS_TARGET;
1013 break; 1013 break;
1014 case MEM_CGROUP_TARGET_SOFTLIMIT: 1014 case MEM_CGROUP_TARGET_SOFTLIMIT:
1015 next = val + SOFTLIMIT_EVENTS_TARGET; 1015 next = val + SOFTLIMIT_EVENTS_TARGET;
1016 break; 1016 break;
1017 case MEM_CGROUP_TARGET_NUMAINFO: 1017 case MEM_CGROUP_TARGET_NUMAINFO:
1018 next = val + NUMAINFO_EVENTS_TARGET; 1018 next = val + NUMAINFO_EVENTS_TARGET;
1019 break; 1019 break;
1020 default: 1020 default:
1021 break; 1021 break;
1022 } 1022 }
1023 __this_cpu_write(memcg->stat->targets[target], next); 1023 __this_cpu_write(memcg->stat->targets[target], next);
1024 return true; 1024 return true;
1025 } 1025 }
1026 return false; 1026 return false;
1027 } 1027 }
1028 1028
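The ratelimit check above borrows the time_after() trick: comparing "(long)next - (long)val < 0" instead of "val > next" keeps working when the unsigned event counter wraps around, and the target is then bumped by a per-target step. A standalone sketch of that comparison; DEMO_THRESHOLD is an arbitrary stand-in for constants like THRESHOLDS_EVENTS_TARGET:

#include <assert.h>
#include <limits.h>

#define DEMO_THRESHOLD 128	/* arbitrary stand-in for the per-target step */

/* Wraparound-safe "val has reached next", as in the function above. */
static int demo_ratelimit(unsigned long val, unsigned long *next)
{
	if ((long)*next - (long)val < 0) {
		*next = val + DEMO_THRESHOLD;	/* arm the next target */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long next = 100;

	assert(!demo_ratelimit(50, &next));	/* not there yet */
	assert(demo_ratelimit(101, &next));	/* crossed, target bumped */
	assert(next == 101 + DEMO_THRESHOLD);

	/* Still behaves when the counter wraps past ULONG_MAX. */
	next = ULONG_MAX - 10;
	assert(demo_ratelimit(5, &next));	/* 5 is "after" ULONG_MAX - 10 */
	return 0;
}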
1029 /* 1029 /*
1030 * Check events in order. 1030 * Check events in order.
1031 * 1031 *
1032 */ 1032 */
1033 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1033 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1034 { 1034 {
1035 preempt_disable(); 1035 preempt_disable();
1036 /* threshold event is triggered in finer grain than soft limit */ 1036 /* threshold event is triggered in finer grain than soft limit */
1037 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1037 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1038 MEM_CGROUP_TARGET_THRESH))) { 1038 MEM_CGROUP_TARGET_THRESH))) {
1039 bool do_softlimit; 1039 bool do_softlimit;
1040 bool do_numainfo __maybe_unused; 1040 bool do_numainfo __maybe_unused;
1041 1041
1042 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1042 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1043 MEM_CGROUP_TARGET_SOFTLIMIT); 1043 MEM_CGROUP_TARGET_SOFTLIMIT);
1044 #if MAX_NUMNODES > 1 1044 #if MAX_NUMNODES > 1
1045 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1045 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1046 MEM_CGROUP_TARGET_NUMAINFO); 1046 MEM_CGROUP_TARGET_NUMAINFO);
1047 #endif 1047 #endif
1048 preempt_enable(); 1048 preempt_enable();
1049 1049
1050 mem_cgroup_threshold(memcg); 1050 mem_cgroup_threshold(memcg);
1051 if (unlikely(do_softlimit)) 1051 if (unlikely(do_softlimit))
1052 mem_cgroup_update_tree(memcg, page); 1052 mem_cgroup_update_tree(memcg, page);
1053 #if MAX_NUMNODES > 1 1053 #if MAX_NUMNODES > 1
1054 if (unlikely(do_numainfo)) 1054 if (unlikely(do_numainfo))
1055 atomic_inc(&memcg->numainfo_events); 1055 atomic_inc(&memcg->numainfo_events);
1056 #endif 1056 #endif
1057 } else 1057 } else
1058 preempt_enable(); 1058 preempt_enable();
1059 } 1059 }
1060 1060
1061 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1061 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1062 { 1062 {
1063 /* 1063 /*
1064 * mm_update_next_owner() may clear mm->owner to NULL 1064 * mm_update_next_owner() may clear mm->owner to NULL
1065 * if it races with swapoff, page migration, etc. 1065 * if it races with swapoff, page migration, etc.
1066 * So this can be called with p == NULL. 1066 * So this can be called with p == NULL.
1067 */ 1067 */
1068 if (unlikely(!p)) 1068 if (unlikely(!p))
1069 return NULL; 1069 return NULL;
1070 1070
1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1072 } 1072 }
1073 1073
1074 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1074 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1075 { 1075 {
1076 struct mem_cgroup *memcg = NULL; 1076 struct mem_cgroup *memcg = NULL;
1077 1077
1078 rcu_read_lock(); 1078 rcu_read_lock();
1079 do { 1079 do {
1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1081 if (unlikely(!memcg)) 1081 if (unlikely(!memcg))
1082 memcg = root_mem_cgroup; 1082 memcg = root_mem_cgroup;
1083 } while (!css_tryget(&memcg->css)); 1083 } while (!css_tryget(&memcg->css));
1084 rcu_read_unlock(); 1084 rcu_read_unlock();
1085 return memcg; 1085 return memcg;
1086 } 1086 }
1087 1087
1088 /* 1088 /*
1089 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1089 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1090 * ref. count) or NULL if the whole root's subtree has been visited. 1090 * ref. count) or NULL if the whole root's subtree has been visited.
1091 * 1091 *
1092 * helper function to be used by mem_cgroup_iter 1092 * helper function to be used by mem_cgroup_iter
1093 */ 1093 */
1094 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1094 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1095 struct mem_cgroup *last_visited) 1095 struct mem_cgroup *last_visited)
1096 { 1096 {
1097 struct cgroup_subsys_state *prev_css, *next_css; 1097 struct cgroup_subsys_state *prev_css, *next_css;
1098 1098
1099 prev_css = last_visited ? &last_visited->css : NULL; 1099 prev_css = last_visited ? &last_visited->css : NULL;
1100 skip_node: 1100 skip_node:
1101 next_css = css_next_descendant_pre(prev_css, &root->css); 1101 next_css = css_next_descendant_pre(prev_css, &root->css);
1102 1102
1103 /* 1103 /*
1104 * Even if we found a group, we have to make sure it is 1104 * Even if we found a group, we have to make sure it is
1105 * alive. css && !memcg means that the group should be 1105 * alive. css && !memcg means that the group should be
1106 * skipped and we should continue the tree walk. 1106 * skipped and we should continue the tree walk.
1107 * last_visited css is safe to use because it is 1107 * last_visited css is safe to use because it is
1108 * protected by css_get and the tree walk is rcu safe. 1108 * protected by css_get and the tree walk is rcu safe.
1109 * 1109 *
1110 * We do not take a reference on the root of the tree walk 1110 * We do not take a reference on the root of the tree walk
1111 * because we might race with the root removal when it would 1111 * because we might race with the root removal when it would
1112 * be the only node in the iterated hierarchy and mem_cgroup_iter 1112 * be the only node in the iterated hierarchy and mem_cgroup_iter
1113 * would end up in an endless loop because it expects that at 1113 * would end up in an endless loop because it expects that at
1114 * least one valid node will be returned. Root cannot disappear 1114 * least one valid node will be returned. Root cannot disappear
1115 * because the caller of the iterator should hold it already, so 1115 * because the caller of the iterator should hold it already, so
1116 * skipping the css reference should be safe. 1116 * skipping the css reference should be safe.
1117 */ 1117 */
1118 if (next_css) { 1118 if (next_css) {
1119 if ((next_css == &root->css) || 1119 if ((next_css == &root->css) ||
1120 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) 1120 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1121 return mem_cgroup_from_css(next_css); 1121 return mem_cgroup_from_css(next_css);
1122 1122
1123 prev_css = next_css; 1123 prev_css = next_css;
1124 goto skip_node; 1124 goto skip_node;
1125 } 1125 }
1126 1126
1127 return NULL; 1127 return NULL;
1128 } 1128 }
1129 1129
1130 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) 1130 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1131 { 1131 {
1132 /* 1132 /*
1133 * When a group in the hierarchy below root is destroyed, the 1133 * When a group in the hierarchy below root is destroyed, the
1134 * hierarchy iterator can no longer be trusted since it might 1134 * hierarchy iterator can no longer be trusted since it might
1135 * have pointed to the destroyed group. Invalidate it. 1135 * have pointed to the destroyed group. Invalidate it.
1136 */ 1136 */
1137 atomic_inc(&root->dead_count); 1137 atomic_inc(&root->dead_count);
1138 } 1138 }
1139 1139
1140 static struct mem_cgroup * 1140 static struct mem_cgroup *
1141 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, 1141 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1142 struct mem_cgroup *root, 1142 struct mem_cgroup *root,
1143 int *sequence) 1143 int *sequence)
1144 { 1144 {
1145 struct mem_cgroup *position = NULL; 1145 struct mem_cgroup *position = NULL;
1146 /* 1146 /*
1147 * A cgroup destruction happens in two stages: offlining and 1147 * A cgroup destruction happens in two stages: offlining and
1148 * release. They are separated by a RCU grace period. 1148 * release. They are separated by a RCU grace period.
1149 * 1149 *
1150 * If the iterator is valid, we may still race with an 1150 * If the iterator is valid, we may still race with an
1151 * offlining. The RCU lock ensures the object won't be 1151 * offlining. The RCU lock ensures the object won't be
1152 * released, tryget will fail if we lost the race. 1152 * released, tryget will fail if we lost the race.
1153 */ 1153 */
1154 *sequence = atomic_read(&root->dead_count); 1154 *sequence = atomic_read(&root->dead_count);
1155 if (iter->last_dead_count == *sequence) { 1155 if (iter->last_dead_count == *sequence) {
1156 smp_rmb(); 1156 smp_rmb();
1157 position = iter->last_visited; 1157 position = iter->last_visited;
1158 1158
1159 /* 1159 /*
1160 * We cannot take a reference to root because we might race 1160 * We cannot take a reference to root because we might race
1161 * with root removal and returning NULL would end up in 1161 * with root removal and returning NULL would end up in
1162 * an endless loop on the iterator user level when root 1162 * an endless loop on the iterator user level when root
1163 * would be returned all the time. 1163 * would be returned all the time.
1164 */ 1164 */
1165 if (position && position != root && 1165 if (position && position != root &&
1166 !css_tryget(&position->css)) 1166 !css_tryget(&position->css))
1167 position = NULL; 1167 position = NULL;
1168 } 1168 }
1169 return position; 1169 return position;
1170 } 1170 }
1171 1171
1172 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1172 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1173 struct mem_cgroup *last_visited, 1173 struct mem_cgroup *last_visited,
1174 struct mem_cgroup *new_position, 1174 struct mem_cgroup *new_position,
1175 struct mem_cgroup *root, 1175 struct mem_cgroup *root,
1176 int sequence) 1176 int sequence)
1177 { 1177 {
1178 /* root reference counting symmetric to mem_cgroup_iter_load */ 1178 /* root reference counting symmetric to mem_cgroup_iter_load */
1179 if (last_visited && last_visited != root) 1179 if (last_visited && last_visited != root)
1180 css_put(&last_visited->css); 1180 css_put(&last_visited->css);
1181 /* 1181 /*
1182 * We store the sequence count from the time @last_visited was 1182 * We store the sequence count from the time @last_visited was
1183 * loaded successfully instead of rereading it here so that we 1183 * loaded successfully instead of rereading it here so that we
1184 * don't lose destruction events in between. We could have 1184 * don't lose destruction events in between. We could have
1185 * raced with the destruction of @new_position after all. 1185 * raced with the destruction of @new_position after all.
1186 */ 1186 */
1187 iter->last_visited = new_position; 1187 iter->last_visited = new_position;
1188 smp_wmb(); 1188 smp_wmb();
1189 iter->last_dead_count = sequence; 1189 iter->last_dead_count = sequence;
1190 } 1190 }
1191 1191
1192 /** 1192 /**
1193 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1193 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1194 * @root: hierarchy root 1194 * @root: hierarchy root
1195 * @prev: previously returned memcg, NULL on first invocation 1195 * @prev: previously returned memcg, NULL on first invocation
1196 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1196 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1197 * 1197 *
1198 * Returns references to children of the hierarchy below @root, or 1198 * Returns references to children of the hierarchy below @root, or
1199 * @root itself, or %NULL after a full round-trip. 1199 * @root itself, or %NULL after a full round-trip.
1200 * 1200 *
1201 * Caller must pass the return value in @prev on subsequent 1201 * Caller must pass the return value in @prev on subsequent
1202 * invocations for reference counting, or use mem_cgroup_iter_break() 1202 * invocations for reference counting, or use mem_cgroup_iter_break()
1203 * to cancel a hierarchy walk before the round-trip is complete. 1203 * to cancel a hierarchy walk before the round-trip is complete.
1204 * 1204 *
1205 * Reclaimers can specify a zone and a priority level in @reclaim to 1205 * Reclaimers can specify a zone and a priority level in @reclaim to
1206 * divide up the memcgs in the hierarchy among all concurrent 1206 * divide up the memcgs in the hierarchy among all concurrent
1207 * reclaimers operating on the same zone and priority. 1207 * reclaimers operating on the same zone and priority.
1208 */ 1208 */
1209 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1209 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1210 struct mem_cgroup *prev, 1210 struct mem_cgroup *prev,
1211 struct mem_cgroup_reclaim_cookie *reclaim) 1211 struct mem_cgroup_reclaim_cookie *reclaim)
1212 { 1212 {
1213 struct mem_cgroup *memcg = NULL; 1213 struct mem_cgroup *memcg = NULL;
1214 struct mem_cgroup *last_visited = NULL; 1214 struct mem_cgroup *last_visited = NULL;
1215 1215
1216 if (mem_cgroup_disabled()) 1216 if (mem_cgroup_disabled())
1217 return NULL; 1217 return NULL;
1218 1218
1219 if (!root) 1219 if (!root)
1220 root = root_mem_cgroup; 1220 root = root_mem_cgroup;
1221 1221
1222 if (prev && !reclaim) 1222 if (prev && !reclaim)
1223 last_visited = prev; 1223 last_visited = prev;
1224 1224
1225 if (!root->use_hierarchy && root != root_mem_cgroup) { 1225 if (!root->use_hierarchy && root != root_mem_cgroup) {
1226 if (prev) 1226 if (prev)
1227 goto out_css_put; 1227 goto out_css_put;
1228 return root; 1228 return root;
1229 } 1229 }
1230 1230
1231 rcu_read_lock(); 1231 rcu_read_lock();
1232 while (!memcg) { 1232 while (!memcg) {
1233 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1233 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1234 int uninitialized_var(seq); 1234 int uninitialized_var(seq);
1235 1235
1236 if (reclaim) { 1236 if (reclaim) {
1237 int nid = zone_to_nid(reclaim->zone); 1237 int nid = zone_to_nid(reclaim->zone);
1238 int zid = zone_idx(reclaim->zone); 1238 int zid = zone_idx(reclaim->zone);
1239 struct mem_cgroup_per_zone *mz; 1239 struct mem_cgroup_per_zone *mz;
1240 1240
1241 mz = mem_cgroup_zoneinfo(root, nid, zid); 1241 mz = mem_cgroup_zoneinfo(root, nid, zid);
1242 iter = &mz->reclaim_iter[reclaim->priority]; 1242 iter = &mz->reclaim_iter[reclaim->priority];
1243 if (prev && reclaim->generation != iter->generation) { 1243 if (prev && reclaim->generation != iter->generation) {
1244 iter->last_visited = NULL; 1244 iter->last_visited = NULL;
1245 goto out_unlock; 1245 goto out_unlock;
1246 } 1246 }
1247 1247
1248 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1248 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1249 } 1249 }
1250 1250
1251 memcg = __mem_cgroup_iter_next(root, last_visited); 1251 memcg = __mem_cgroup_iter_next(root, last_visited);
1252 1252
1253 if (reclaim) { 1253 if (reclaim) {
1254 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1254 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1255 seq); 1255 seq);
1256 1256
1257 if (!memcg) 1257 if (!memcg)
1258 iter->generation++; 1258 iter->generation++;
1259 else if (!prev && memcg) 1259 else if (!prev && memcg)
1260 reclaim->generation = iter->generation; 1260 reclaim->generation = iter->generation;
1261 } 1261 }
1262 1262
1263 if (prev && !memcg) 1263 if (prev && !memcg)
1264 goto out_unlock; 1264 goto out_unlock;
1265 } 1265 }
1266 out_unlock: 1266 out_unlock:
1267 rcu_read_unlock(); 1267 rcu_read_unlock();
1268 out_css_put: 1268 out_css_put:
1269 if (prev && prev != root) 1269 if (prev && prev != root)
1270 css_put(&prev->css); 1270 css_put(&prev->css);
1271 1271
1272 return memcg; 1272 return memcg;
1273 } 1273 }
1274 1274
1275 /** 1275 /**
1276 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1276 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1277 * @root: hierarchy root 1277 * @root: hierarchy root
1278 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1278 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1279 */ 1279 */
1280 void mem_cgroup_iter_break(struct mem_cgroup *root, 1280 void mem_cgroup_iter_break(struct mem_cgroup *root,
1281 struct mem_cgroup *prev) 1281 struct mem_cgroup *prev)
1282 { 1282 {
1283 if (!root) 1283 if (!root)
1284 root = root_mem_cgroup; 1284 root = root_mem_cgroup;
1285 if (prev && prev != root) 1285 if (prev && prev != root)
1286 css_put(&prev->css); 1286 css_put(&prev->css);
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * Iteration constructs for visiting all cgroups (under a tree). If 1290 * Iteration constructs for visiting all cgroups (under a tree). If
1291 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1291 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1292 * be used for reference counting. 1292 * be used for reference counting.
1293 */ 1293 */
1294 #define for_each_mem_cgroup_tree(iter, root) \ 1294 #define for_each_mem_cgroup_tree(iter, root) \
1295 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1295 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1296 iter != NULL; \ 1296 iter != NULL; \
1297 iter = mem_cgroup_iter(root, iter, NULL)) 1297 iter = mem_cgroup_iter(root, iter, NULL))
1298 1298
1299 #define for_each_mem_cgroup(iter) \ 1299 #define for_each_mem_cgroup(iter) \
1300 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1300 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1301 iter != NULL; \ 1301 iter != NULL; \
1302 iter = mem_cgroup_iter(NULL, iter, NULL)) 1302 iter = mem_cgroup_iter(NULL, iter, NULL))
1303 1303
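As the comment above the macros notes, a walk that is abandoned early must drop the reference held on the last returned memcg. Below is a purely illustrative, non-compilable-in-isolation sketch of a caller honoring that rule with the macros defined above; demo_want_to_stop() is a made-up predicate, not a helper that exists in this file or in this patch.

/* Illustrative only -- not part of this patch. */
static void demo_walk_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (demo_want_to_stop(iter)) {		/* hypothetical predicate */
			/* bailing out early: release the css reference */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}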
1304 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1304 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1305 { 1305 {
1306 struct mem_cgroup *memcg; 1306 struct mem_cgroup *memcg;
1307 1307
1308 rcu_read_lock(); 1308 rcu_read_lock();
1309 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1309 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1310 if (unlikely(!memcg)) 1310 if (unlikely(!memcg))
1311 goto out; 1311 goto out;
1312 1312
1313 switch (idx) { 1313 switch (idx) {
1314 case PGFAULT: 1314 case PGFAULT:
1315 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1315 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1316 break; 1316 break;
1317 case PGMAJFAULT: 1317 case PGMAJFAULT:
1318 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1318 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1319 break; 1319 break;
1320 default: 1320 default:
1321 BUG(); 1321 BUG();
1322 } 1322 }
1323 out: 1323 out:
1324 rcu_read_unlock(); 1324 rcu_read_unlock();
1325 } 1325 }
1326 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1326 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1327 1327
1328 /** 1328 /**
1329 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1329 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1330 * @zone: zone of the wanted lruvec 1330 * @zone: zone of the wanted lruvec
1331 * @memcg: memcg of the wanted lruvec 1331 * @memcg: memcg of the wanted lruvec
1332 * 1332 *
1333 * Returns the lru list vector holding pages for the given @zone and 1333 * Returns the lru list vector holding pages for the given @zone and
1334 * @memcg. This can be the global zone lruvec, if the memory controller 1334 * @memcg. This can be the global zone lruvec, if the memory controller
1335 * is disabled. 1335 * is disabled.
1336 */ 1336 */
1337 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1337 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1338 struct mem_cgroup *memcg) 1338 struct mem_cgroup *memcg)
1339 { 1339 {
1340 struct mem_cgroup_per_zone *mz; 1340 struct mem_cgroup_per_zone *mz;
1341 struct lruvec *lruvec; 1341 struct lruvec *lruvec;
1342 1342
1343 if (mem_cgroup_disabled()) { 1343 if (mem_cgroup_disabled()) {
1344 lruvec = &zone->lruvec; 1344 lruvec = &zone->lruvec;
1345 goto out; 1345 goto out;
1346 } 1346 }
1347 1347
1348 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1348 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1349 lruvec = &mz->lruvec; 1349 lruvec = &mz->lruvec;
1350 out: 1350 out:
1351 /* 1351 /*
1352 * Since a node can be onlined after the mem_cgroup was created, 1352 * Since a node can be onlined after the mem_cgroup was created,
1353 * we have to be prepared to initialize lruvec->zone here; 1353 * we have to be prepared to initialize lruvec->zone here;
1354 * and if offlined then reonlined, we need to reinitialize it. 1354 * and if offlined then reonlined, we need to reinitialize it.
1355 */ 1355 */
1356 if (unlikely(lruvec->zone != zone)) 1356 if (unlikely(lruvec->zone != zone))
1357 lruvec->zone = zone; 1357 lruvec->zone = zone;
1358 return lruvec; 1358 return lruvec;
1359 } 1359 }
1360 1360
1361 /* 1361 /*
1362 * Following LRU functions are allowed to be used without PCG_LOCK. 1362 * Following LRU functions are allowed to be used without PCG_LOCK.
1363 * Operations are called by routine of global LRU independently from memcg. 1363 * Operations are called by routine of global LRU independently from memcg.
1364 * What we have to take care of here is the validity of pc->mem_cgroup. 1364 * What we have to take care of here is the validity of pc->mem_cgroup.
1365 * 1365 *
1366 * Changes to pc->mem_cgroup happens when 1366 * Changes to pc->mem_cgroup happens when
1367 * 1. charge 1367 * 1. charge
1368 * 2. moving account 1368 * 2. moving account
1369 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache. 1369 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache.
1370 * It is added to LRU before charge. 1370 * It is added to LRU before charge.
1371 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1371 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1372 * When moving account, the page is not on LRU. It's isolated. 1372 * When moving account, the page is not on LRU. It's isolated.
1373 */ 1373 */
1374 1374
1375 /** 1375 /**
1376 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1376 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1377 * @page: the page 1377 * @page: the page
1378 * @zone: zone of the page 1378 * @zone: zone of the page
1379 */ 1379 */
1380 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1380 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1381 { 1381 {
1382 struct mem_cgroup_per_zone *mz; 1382 struct mem_cgroup_per_zone *mz;
1383 struct mem_cgroup *memcg; 1383 struct mem_cgroup *memcg;
1384 struct page_cgroup *pc; 1384 struct page_cgroup *pc;
1385 struct lruvec *lruvec; 1385 struct lruvec *lruvec;
1386 1386
1387 if (mem_cgroup_disabled()) { 1387 if (mem_cgroup_disabled()) {
1388 lruvec = &zone->lruvec; 1388 lruvec = &zone->lruvec;
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 1391
1392 pc = lookup_page_cgroup(page); 1392 pc = lookup_page_cgroup(page);
1393 memcg = pc->mem_cgroup; 1393 memcg = pc->mem_cgroup;
1394 1394
1395 /* 1395 /*
1396 * Surreptitiously switch any uncharged offlist page to root: 1396 * Surreptitiously switch any uncharged offlist page to root:
1397 * an uncharged page off lru does nothing to secure 1397 * an uncharged page off lru does nothing to secure
1398 * its former mem_cgroup from sudden removal. 1398 * its former mem_cgroup from sudden removal.
1399 * 1399 *
1400 * Our caller holds lru_lock, and PageCgroupUsed is updated 1400 * Our caller holds lru_lock, and PageCgroupUsed is updated
1401 * under page_cgroup lock: between them, they make all uses 1401 * under page_cgroup lock: between them, they make all uses
1402 * of pc->mem_cgroup safe. 1402 * of pc->mem_cgroup safe.
1403 */ 1403 */
1404 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1404 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1405 pc->mem_cgroup = memcg = root_mem_cgroup; 1405 pc->mem_cgroup = memcg = root_mem_cgroup;
1406 1406
1407 mz = page_cgroup_zoneinfo(memcg, page); 1407 mz = page_cgroup_zoneinfo(memcg, page);
1408 lruvec = &mz->lruvec; 1408 lruvec = &mz->lruvec;
1409 out: 1409 out:
1410 /* 1410 /*
1411 * Since a node can be onlined after the mem_cgroup was created, 1411 * Since a node can be onlined after the mem_cgroup was created,
1412 * we have to be prepared to initialize lruvec->zone here; 1412 * we have to be prepared to initialize lruvec->zone here;
1413 * and if offlined then reonlined, we need to reinitialize it. 1413 * and if offlined then reonlined, we need to reinitialize it.
1414 */ 1414 */
1415 if (unlikely(lruvec->zone != zone)) 1415 if (unlikely(lruvec->zone != zone))
1416 lruvec->zone = zone; 1416 lruvec->zone = zone;
1417 return lruvec; 1417 return lruvec;
1418 } 1418 }
1419 1419
1420 /** 1420 /**
1421 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1421 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1422 * @lruvec: mem_cgroup per zone lru vector 1422 * @lruvec: mem_cgroup per zone lru vector
1423 * @lru: index of lru list the page is sitting on 1423 * @lru: index of lru list the page is sitting on
1424 * @nr_pages: positive when adding or negative when removing 1424 * @nr_pages: positive when adding or negative when removing
1425 * 1425 *
1426 * This function must be called when a page is added to or removed from an 1426 * This function must be called when a page is added to or removed from an
1427 * lru list. 1427 * lru list.
1428 */ 1428 */
1429 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1429 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1430 int nr_pages) 1430 int nr_pages)
1431 { 1431 {
1432 struct mem_cgroup_per_zone *mz; 1432 struct mem_cgroup_per_zone *mz;
1433 unsigned long *lru_size; 1433 unsigned long *lru_size;
1434 1434
1435 if (mem_cgroup_disabled()) 1435 if (mem_cgroup_disabled())
1436 return; 1436 return;
1437 1437
1438 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1438 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1439 lru_size = mz->lru_size + lru; 1439 lru_size = mz->lru_size + lru;
1440 *lru_size += nr_pages; 1440 *lru_size += nr_pages;
1441 VM_BUG_ON((long)(*lru_size) < 0); 1441 VM_BUG_ON((long)(*lru_size) < 0);
1442 } 1442 }
1443 1443
1444 /* 1444 /*
1445 * Checks whether given mem is same or in the root_mem_cgroup's 1445 * Checks whether given mem is same or in the root_mem_cgroup's
1446 * hierarchy subtree 1446 * hierarchy subtree
1447 */ 1447 */
1448 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1448 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1449 struct mem_cgroup *memcg) 1449 struct mem_cgroup *memcg)
1450 { 1450 {
1451 if (root_memcg == memcg) 1451 if (root_memcg == memcg)
1452 return true; 1452 return true;
1453 if (!root_memcg->use_hierarchy || !memcg) 1453 if (!root_memcg->use_hierarchy || !memcg)
1454 return false; 1454 return false;
1455 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1455 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1456 } 1456 }
1457 1457
1458 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1458 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1459 struct mem_cgroup *memcg) 1459 struct mem_cgroup *memcg)
1460 { 1460 {
1461 bool ret; 1461 bool ret;
1462 1462
1463 rcu_read_lock(); 1463 rcu_read_lock();
1464 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1464 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1465 rcu_read_unlock(); 1465 rcu_read_unlock();
1466 return ret; 1466 return ret;
1467 } 1467 }
1468 1468
1469 bool task_in_mem_cgroup(struct task_struct *task, 1469 bool task_in_mem_cgroup(struct task_struct *task,
1470 const struct mem_cgroup *memcg) 1470 const struct mem_cgroup *memcg)
1471 { 1471 {
1472 struct mem_cgroup *curr = NULL; 1472 struct mem_cgroup *curr = NULL;
1473 struct task_struct *p; 1473 struct task_struct *p;
1474 bool ret; 1474 bool ret;
1475 1475
1476 p = find_lock_task_mm(task); 1476 p = find_lock_task_mm(task);
1477 if (p) { 1477 if (p) {
1478 curr = get_mem_cgroup_from_mm(p->mm); 1478 curr = get_mem_cgroup_from_mm(p->mm);
1479 task_unlock(p); 1479 task_unlock(p);
1480 } else { 1480 } else {
1481 /* 1481 /*
1482 * All threads may have already detached their mm's, but the oom 1482 * All threads may have already detached their mm's, but the oom
1483 * killer still needs to detect if they have already been oom 1483 * killer still needs to detect if they have already been oom
1484 * killed to prevent needlessly killing additional tasks. 1484 * killed to prevent needlessly killing additional tasks.
1485 */ 1485 */
1486 rcu_read_lock(); 1486 rcu_read_lock();
1487 curr = mem_cgroup_from_task(task); 1487 curr = mem_cgroup_from_task(task);
1488 if (curr) 1488 if (curr)
1489 css_get(&curr->css); 1489 css_get(&curr->css);
1490 rcu_read_unlock(); 1490 rcu_read_unlock();
1491 } 1491 }
1492 /* 1492 /*
1493 * We should check use_hierarchy of "memcg", not "curr". Checking 1493 * We should check use_hierarchy of "memcg", not "curr". Checking
1494 * use_hierarchy of "curr" here would make this function return true if 1494 * use_hierarchy of "curr" here would make this function return true if
1495 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the 1495 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1496 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). 1496 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1497 */ 1497 */
1498 ret = mem_cgroup_same_or_subtree(memcg, curr); 1498 ret = mem_cgroup_same_or_subtree(memcg, curr);
1499 css_put(&curr->css); 1499 css_put(&curr->css);
1500 return ret; 1500 return ret;
1501 } 1501 }
1502 1502
1503 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1503 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 { 1504 {
1505 unsigned long inactive_ratio; 1505 unsigned long inactive_ratio;
1506 unsigned long inactive; 1506 unsigned long inactive;
1507 unsigned long active; 1507 unsigned long active;
1508 unsigned long gb; 1508 unsigned long gb;
1509 1509
1510 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1510 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1511 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1511 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1512 1512
1513 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1513 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1514 if (gb) 1514 if (gb)
1515 inactive_ratio = int_sqrt(10 * gb); 1515 inactive_ratio = int_sqrt(10 * gb);
1516 else 1516 else
1517 inactive_ratio = 1; 1517 inactive_ratio = 1;
1518 1518
1519 return inactive * inactive_ratio < active; 1519 return inactive * inactive_ratio < active;
1520 } 1520 }
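For orientation only, here is a stand-alone user-space sketch (not part of this patch) that mirrors the inactive/active heuristic above. It assumes 4KB pages, substitutes libm's sqrt() for the kernel's int_sqrt(), and the page counts and the file name sketch.c are made up for illustration; build with `cc sketch.c -lm`.

#include <math.h>
#include <stdio.h>

/* Mirror of the heuristic above: inactive_ratio = int_sqrt(10 * gb), minimum 1. */
static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - 12);	/* 4KB pages -> GiB */

	return gb ? (unsigned long)sqrt(10 * gb) : 1;
}

int main(void)
{
	unsigned long inactive = 100000, active = 900000;	/* ~3.8GiB of anon pages */
	unsigned long ratio = inactive_ratio(inactive, active);	/* int_sqrt(30) == 5 */

	/* Prints 1: the inactive list is considered low relative to the active list. */
	printf("inactive_is_low = %d\n", inactive * ratio < active);
	return 0;
}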
1521 1521
1522 #define mem_cgroup_from_res_counter(counter, member) \ 1522 #define mem_cgroup_from_res_counter(counter, member) \
1523 container_of(counter, struct mem_cgroup, member) 1523 container_of(counter, struct mem_cgroup, member)
1524 1524
1525 /** 1525 /**
1526 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1526 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1527 * @memcg: the memory cgroup 1527 * @memcg: the memory cgroup
1528 * 1528 *
1529 * Returns the maximum amount of memory @memcg can be charged with, in 1529 * Returns the maximum amount of memory @memcg can be charged with, in
1530 * pages. 1530 * pages.
1531 */ 1531 */
1532 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1532 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1533 { 1533 {
1534 unsigned long long margin; 1534 unsigned long long margin;
1535 1535
1536 margin = res_counter_margin(&memcg->res); 1536 margin = res_counter_margin(&memcg->res);
1537 if (do_swap_account) 1537 if (do_swap_account)
1538 margin = min(margin, res_counter_margin(&memcg->memsw)); 1538 margin = min(margin, res_counter_margin(&memcg->memsw));
1539 return margin >> PAGE_SHIFT; 1539 return margin >> PAGE_SHIFT;
1540 } 1540 }
1541 1541
1542 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1542 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1543 { 1543 {
1544 /* root ? */ 1544 /* root ? */
1545 if (!css_parent(&memcg->css)) 1545 if (!css_parent(&memcg->css))
1546 return vm_swappiness; 1546 return vm_swappiness;
1547 1547
1548 return memcg->swappiness; 1548 return memcg->swappiness;
1549 } 1549 }
1550 1550
1551 /* 1551 /*
1552 * memcg->moving_account is used for checking possibility that some thread is 1552 * memcg->moving_account is used for checking possibility that some thread is
1553 * calling move_account(). When a thread on CPU-A starts moving pages under 1553 * calling move_account(). When a thread on CPU-A starts moving pages under
1554 * a memcg, other threads should check memcg->moving_account under 1554 * a memcg, other threads should check memcg->moving_account under
1555 * rcu_read_lock(), like this: 1555 * rcu_read_lock(), like this:
1556 * 1556 *
1557 * CPU-A CPU-B 1557 * CPU-A CPU-B
1558 * rcu_read_lock() 1558 * rcu_read_lock()
1559 * memcg->moving_account+1 if (memcg->moving_account) 1559 * memcg->moving_account+1 if (memcg->moving_account)
1560 * take heavy locks. 1560 * take heavy locks.
1561 * synchronize_rcu() update something. 1561 * synchronize_rcu() update something.
1562 * rcu_read_unlock() 1562 * rcu_read_unlock()
1563 * start move here. 1563 * start move here.
1564 */ 1564 */
1565 1565
1566 /* for quick checking without looking up memcg */ 1566 /* for quick checking without looking up memcg */
1567 atomic_t memcg_moving __read_mostly; 1567 atomic_t memcg_moving __read_mostly;
1568 1568
1569 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1569 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1570 { 1570 {
1571 atomic_inc(&memcg_moving); 1571 atomic_inc(&memcg_moving);
1572 atomic_inc(&memcg->moving_account); 1572 atomic_inc(&memcg->moving_account);
1573 synchronize_rcu(); 1573 synchronize_rcu();
1574 } 1574 }
1575 1575
1576 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1576 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1577 { 1577 {
1578 /* 1578 /*
1579 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1579 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1580 * We check NULL in callee rather than caller. 1580 * We check NULL in callee rather than caller.
1581 */ 1581 */
1582 if (memcg) { 1582 if (memcg) {
1583 atomic_dec(&memcg_moving); 1583 atomic_dec(&memcg_moving);
1584 atomic_dec(&memcg->moving_account); 1584 atomic_dec(&memcg->moving_account);
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 /* 1588 /*
1589 * Two routines for checking whether "mem" is under move_account() or not. 1589 * Two routines for checking whether "mem" is under move_account() or not.
1590 * 1590 *
1591 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1591 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1592 * is used for avoiding races in accounting. If true, 1592 * is used for avoiding races in accounting. If true,
1593 * pc->mem_cgroup may be overwritten. 1593 * pc->mem_cgroup may be overwritten.
1594 * 1594 *
1595 * mem_cgroup_under_move() - checking whether a cgroup is mc.from, mc.to or 1595 * mem_cgroup_under_move() - checking whether a cgroup is mc.from, mc.to or
1596 * under the hierarchy of moving cgroups. This is for 1596 * under the hierarchy of moving cgroups. This is for
1597 * waiting at high memory pressure caused by "move". 1597 * waiting at high memory pressure caused by "move".
1598 */ 1598 */
1599 1599
1600 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1600 static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1601 { 1601 {
1602 VM_BUG_ON(!rcu_read_lock_held()); 1602 VM_BUG_ON(!rcu_read_lock_held());
1603 return atomic_read(&memcg->moving_account) > 0; 1603 return atomic_read(&memcg->moving_account) > 0;
1604 } 1604 }
1605 1605
1606 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1606 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1607 { 1607 {
1608 struct mem_cgroup *from; 1608 struct mem_cgroup *from;
1609 struct mem_cgroup *to; 1609 struct mem_cgroup *to;
1610 bool ret = false; 1610 bool ret = false;
1611 /* 1611 /*
1612 * Unlike task_move routines, we access mc.to, mc.from not under 1612 * Unlike task_move routines, we access mc.to, mc.from not under
1613 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1613 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1614 */ 1614 */
1615 spin_lock(&mc.lock); 1615 spin_lock(&mc.lock);
1616 from = mc.from; 1616 from = mc.from;
1617 to = mc.to; 1617 to = mc.to;
1618 if (!from) 1618 if (!from)
1619 goto unlock; 1619 goto unlock;
1620 1620
1621 ret = mem_cgroup_same_or_subtree(memcg, from) 1621 ret = mem_cgroup_same_or_subtree(memcg, from)
1622 || mem_cgroup_same_or_subtree(memcg, to); 1622 || mem_cgroup_same_or_subtree(memcg, to);
1623 unlock: 1623 unlock:
1624 spin_unlock(&mc.lock); 1624 spin_unlock(&mc.lock);
1625 return ret; 1625 return ret;
1626 } 1626 }
1627 1627
1628 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1628 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1629 { 1629 {
1630 if (mc.moving_task && current != mc.moving_task) { 1630 if (mc.moving_task && current != mc.moving_task) {
1631 if (mem_cgroup_under_move(memcg)) { 1631 if (mem_cgroup_under_move(memcg)) {
1632 DEFINE_WAIT(wait); 1632 DEFINE_WAIT(wait);
1633 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1633 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1634 /* moving charge context might have finished. */ 1634 /* moving charge context might have finished. */
1635 if (mc.moving_task) 1635 if (mc.moving_task)
1636 schedule(); 1636 schedule();
1637 finish_wait(&mc.waitq, &wait); 1637 finish_wait(&mc.waitq, &wait);
1638 return true; 1638 return true;
1639 } 1639 }
1640 } 1640 }
1641 return false; 1641 return false;
1642 } 1642 }
1643 1643
1644 /* 1644 /*
1645 * Take this lock when 1645 * Take this lock when
1646 * - code tries to modify a page's memcg while it's USED. 1646 * - code tries to modify a page's memcg while it's USED.
1647 * - code tries to modify page state accounting in a memcg. 1647 * - code tries to modify page state accounting in a memcg.
1648 * see mem_cgroup_stolen(), too. 1648 * see mem_cgroup_stolen(), too.
1649 */ 1649 */
1650 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1650 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1651 unsigned long *flags) 1651 unsigned long *flags)
1652 { 1652 {
1653 spin_lock_irqsave(&memcg->move_lock, *flags); 1653 spin_lock_irqsave(&memcg->move_lock, *flags);
1654 } 1654 }
1655 1655
1656 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1656 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1657 unsigned long *flags) 1657 unsigned long *flags)
1658 { 1658 {
1659 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1659 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1660 } 1660 }
1661 1661
1662 #define K(x) ((x) << (PAGE_SHIFT-10)) 1662 #define K(x) ((x) << (PAGE_SHIFT-10))
1663 /** 1663 /**
1664 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1664 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1665 * @memcg: The memory cgroup that went over limit 1665 * @memcg: The memory cgroup that went over limit
1666 * @p: Task that is going to be killed 1666 * @p: Task that is going to be killed
1667 * 1667 *
1668 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1668 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1669 * enabled 1669 * enabled
1670 */ 1670 */
1671 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1671 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1672 { 1672 {
1673 /* oom_info_lock ensures that parallel ooms do not interleave */ 1673 /* oom_info_lock ensures that parallel ooms do not interleave */
1674 static DEFINE_MUTEX(oom_info_lock); 1674 static DEFINE_MUTEX(oom_info_lock);
1675 struct mem_cgroup *iter; 1675 struct mem_cgroup *iter;
1676 unsigned int i; 1676 unsigned int i;
1677 1677
1678 if (!p) 1678 if (!p)
1679 return; 1679 return;
1680 1680
1681 mutex_lock(&oom_info_lock); 1681 mutex_lock(&oom_info_lock);
1682 rcu_read_lock(); 1682 rcu_read_lock();
1683 1683
1684 pr_info("Task in "); 1684 pr_info("Task in ");
1685 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1685 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1686 pr_info(" killed as a result of limit of "); 1686 pr_info(" killed as a result of limit of ");
1687 pr_cont_cgroup_path(memcg->css.cgroup); 1687 pr_cont_cgroup_path(memcg->css.cgroup);
1688 pr_info("\n"); 1688 pr_info("\n");
1689 1689
1690 rcu_read_unlock(); 1690 rcu_read_unlock();
1691 1691
1692 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1692 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1693 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1693 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1694 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1694 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1695 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1695 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1696 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1696 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1697 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1697 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1698 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1698 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1699 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1699 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1700 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1700 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1701 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1701 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1702 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1702 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1703 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1703 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1704 1704
1705 for_each_mem_cgroup_tree(iter, memcg) { 1705 for_each_mem_cgroup_tree(iter, memcg) {
1706 pr_info("Memory cgroup stats for "); 1706 pr_info("Memory cgroup stats for ");
1707 pr_cont_cgroup_path(iter->css.cgroup); 1707 pr_cont_cgroup_path(iter->css.cgroup);
1708 pr_cont(":"); 1708 pr_cont(":");
1709 1709
1710 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1710 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1711 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1711 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1712 continue; 1712 continue;
1713 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1713 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1714 K(mem_cgroup_read_stat(iter, i))); 1714 K(mem_cgroup_read_stat(iter, i)));
1715 } 1715 }
1716 1716
1717 for (i = 0; i < NR_LRU_LISTS; i++) 1717 for (i = 0; i < NR_LRU_LISTS; i++)
1718 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1718 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1719 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1719 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1720 1720
1721 pr_cont("\n"); 1721 pr_cont("\n");
1722 } 1722 }
1723 mutex_unlock(&oom_info_lock); 1723 mutex_unlock(&oom_info_lock);
1724 } 1724 }
1725 1725
1726 /* 1726 /*
1727 * This function returns the number of memcgs under the hierarchy tree. 1727 * This function returns the number of memcgs under the hierarchy tree.
1728 * Returns 1 (self count) if there are no children. 1728 * Returns 1 (self count) if there are no children.
1729 */ 1729 */
1730 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1730 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1731 { 1731 {
1732 int num = 0; 1732 int num = 0;
1733 struct mem_cgroup *iter; 1733 struct mem_cgroup *iter;
1734 1734
1735 for_each_mem_cgroup_tree(iter, memcg) 1735 for_each_mem_cgroup_tree(iter, memcg)
1736 num++; 1736 num++;
1737 return num; 1737 return num;
1738 } 1738 }
1739 1739
1740 /* 1740 /*
1741 * Return the memory (and swap, if configured) limit for a memcg. 1741 * Return the memory (and swap, if configured) limit for a memcg.
1742 */ 1742 */
1743 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1743 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1744 { 1744 {
1745 u64 limit; 1745 u64 limit;
1746 1746
1747 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1747 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1748 1748
1749 /* 1749 /*
1750 * Do not consider swap space if we cannot swap due to swappiness 1750 * Do not consider swap space if we cannot swap due to swappiness
1751 */ 1751 */
1752 if (mem_cgroup_swappiness(memcg)) { 1752 if (mem_cgroup_swappiness(memcg)) {
1753 u64 memsw; 1753 u64 memsw;
1754 1754
1755 limit += total_swap_pages << PAGE_SHIFT; 1755 limit += total_swap_pages << PAGE_SHIFT;
1756 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1756 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1757 1757
1758 /* 1758 /*
1759 * If memsw is finite and limits the amount of swap space 1759 * If memsw is finite and limits the amount of swap space
1760 * available to this memcg, return that limit. 1760 * available to this memcg, return that limit.
1761 */ 1761 */
1762 limit = min(limit, memsw); 1762 limit = min(limit, memsw);
1763 } 1763 }
1764 1764
1765 return limit; 1765 return limit;
1766 } 1766 }
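A hypothetical worked example of the calculation above: with a 512M memory limit, 1G of swap configured, non-zero swappiness and no memory+swap limit set (memsw effectively unlimited), the function returns 512M + 1G = 1.5G. If the memsw limit were set to 768M, the result would be min(1.5G, 768M) = 768M, and with swappiness 0 the swap term is skipped entirely, leaving the plain 512M memory limit.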
1767 1767
1768 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1768 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1769 int order) 1769 int order)
1770 { 1770 {
1771 struct mem_cgroup *iter; 1771 struct mem_cgroup *iter;
1772 unsigned long chosen_points = 0; 1772 unsigned long chosen_points = 0;
1773 unsigned long totalpages; 1773 unsigned long totalpages;
1774 unsigned int points = 0; 1774 unsigned int points = 0;
1775 struct task_struct *chosen = NULL; 1775 struct task_struct *chosen = NULL;
1776 1776
1777 /* 1777 /*
1778 * If current has a pending SIGKILL or is exiting, then automatically 1778 * If current has a pending SIGKILL or is exiting, then automatically
1779 * select it. The goal is to allow it to allocate so that it may 1779 * select it. The goal is to allow it to allocate so that it may
1780 * quickly exit and free its memory. 1780 * quickly exit and free its memory.
1781 */ 1781 */
1782 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1782 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1783 set_thread_flag(TIF_MEMDIE); 1783 set_thread_flag(TIF_MEMDIE);
1784 return; 1784 return;
1785 } 1785 }
1786 1786
1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1789 for_each_mem_cgroup_tree(iter, memcg) { 1789 for_each_mem_cgroup_tree(iter, memcg) {
1790 struct css_task_iter it; 1790 struct css_task_iter it;
1791 struct task_struct *task; 1791 struct task_struct *task;
1792 1792
1793 css_task_iter_start(&iter->css, &it); 1793 css_task_iter_start(&iter->css, &it);
1794 while ((task = css_task_iter_next(&it))) { 1794 while ((task = css_task_iter_next(&it))) {
1795 switch (oom_scan_process_thread(task, totalpages, NULL, 1795 switch (oom_scan_process_thread(task, totalpages, NULL,
1796 false)) { 1796 false)) {
1797 case OOM_SCAN_SELECT: 1797 case OOM_SCAN_SELECT:
1798 if (chosen) 1798 if (chosen)
1799 put_task_struct(chosen); 1799 put_task_struct(chosen);
1800 chosen = task; 1800 chosen = task;
1801 chosen_points = ULONG_MAX; 1801 chosen_points = ULONG_MAX;
1802 get_task_struct(chosen); 1802 get_task_struct(chosen);
1803 /* fall through */ 1803 /* fall through */
1804 case OOM_SCAN_CONTINUE: 1804 case OOM_SCAN_CONTINUE:
1805 continue; 1805 continue;
1806 case OOM_SCAN_ABORT: 1806 case OOM_SCAN_ABORT:
1807 css_task_iter_end(&it); 1807 css_task_iter_end(&it);
1808 mem_cgroup_iter_break(memcg, iter); 1808 mem_cgroup_iter_break(memcg, iter);
1809 if (chosen) 1809 if (chosen)
1810 put_task_struct(chosen); 1810 put_task_struct(chosen);
1811 return; 1811 return;
1812 case OOM_SCAN_OK: 1812 case OOM_SCAN_OK:
1813 break; 1813 break;
1814 }; 1814 };
1815 points = oom_badness(task, memcg, NULL, totalpages); 1815 points = oom_badness(task, memcg, NULL, totalpages);
1816 if (!points || points < chosen_points) 1816 if (!points || points < chosen_points)
1817 continue; 1817 continue;
1818 /* Prefer thread group leaders for display purposes */ 1818 /* Prefer thread group leaders for display purposes */
1819 if (points == chosen_points && 1819 if (points == chosen_points &&
1820 thread_group_leader(chosen)) 1820 thread_group_leader(chosen))
1821 continue; 1821 continue;
1822 1822
1823 if (chosen) 1823 if (chosen)
1824 put_task_struct(chosen); 1824 put_task_struct(chosen);
1825 chosen = task; 1825 chosen = task;
1826 chosen_points = points; 1826 chosen_points = points;
1827 get_task_struct(chosen); 1827 get_task_struct(chosen);
1828 } 1828 }
1829 css_task_iter_end(&it); 1829 css_task_iter_end(&it);
1830 } 1830 }
1831 1831
1832 if (!chosen) 1832 if (!chosen)
1833 return; 1833 return;
1834 points = chosen_points * 1000 / totalpages; 1834 points = chosen_points * 1000 / totalpages;
1835 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1835 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1836 NULL, "Memory cgroup out of memory"); 1836 NULL, "Memory cgroup out of memory");
1837 } 1837 }
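To make the final scaling concrete (numbers are hypothetical): for a memcg limited to 1G, totalpages is 262144 with 4KB pages; if the chosen victim's oom_badness() score is 131072, the reported points become 131072 * 1000 / 262144 = 500, i.e. badness is normalized to a 0..1000 range relative to the cgroup's limit before being passed to oom_kill_process().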
1838 1838
1839 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1839 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1840 gfp_t gfp_mask, 1840 gfp_t gfp_mask,
1841 unsigned long flags) 1841 unsigned long flags)
1842 { 1842 {
1843 unsigned long total = 0; 1843 unsigned long total = 0;
1844 bool noswap = false; 1844 bool noswap = false;
1845 int loop; 1845 int loop;
1846 1846
1847 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1847 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1848 noswap = true; 1848 noswap = true;
1849 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1849 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1850 noswap = true; 1850 noswap = true;
1851 1851
1852 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1852 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1853 if (loop) 1853 if (loop)
1854 drain_all_stock_async(memcg); 1854 drain_all_stock_async(memcg);
1855 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1855 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1856 /* 1856 /*
1857 * Allow limit shrinkers, which are triggered directly 1857 * Allow limit shrinkers, which are triggered directly
1858 * by userspace, to catch signals and stop reclaim 1858 * by userspace, to catch signals and stop reclaim
1859 * after minimal progress, regardless of the margin. 1859 * after minimal progress, regardless of the margin.
1860 */ 1860 */
1861 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1861 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1862 break; 1862 break;
1863 if (mem_cgroup_margin(memcg)) 1863 if (mem_cgroup_margin(memcg))
1864 break; 1864 break;
1865 /* 1865 /*
1866 * If nothing was reclaimed after two attempts, there 1866 * If nothing was reclaimed after two attempts, there
1867 * may be no reclaimable pages in this hierarchy. 1867 * may be no reclaimable pages in this hierarchy.
1868 */ 1868 */
1869 if (loop && !total) 1869 if (loop && !total)
1870 break; 1870 break;
1871 } 1871 }
1872 return total; 1872 return total;
1873 } 1873 }
1874 1874
1875 /** 1875 /**
1876 * test_mem_cgroup_node_reclaimable 1876 * test_mem_cgroup_node_reclaimable
1877 * @memcg: the target memcg 1877 * @memcg: the target memcg
1878 * @nid: the node ID to be checked. 1878 * @nid: the node ID to be checked.
1879 * @noswap : specify true here if the user wants file-only information. 1879 * @noswap : specify true here if the user wants file-only information.
1880 * 1880 *
1881 * This function returns whether the specified memcg contains any 1881 * This function returns whether the specified memcg contains any
1882 * reclaimable pages on a node. Returns true if there are any reclaimable 1882 * reclaimable pages on a node. Returns true if there are any reclaimable
1883 * pages in the node. 1883 * pages in the node.
1884 */ 1884 */
1885 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1885 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1886 int nid, bool noswap) 1886 int nid, bool noswap)
1887 { 1887 {
1888 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1888 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1889 return true; 1889 return true;
1890 if (noswap || !total_swap_pages) 1890 if (noswap || !total_swap_pages)
1891 return false; 1891 return false;
1892 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1892 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1893 return true; 1893 return true;
1894 return false; 1894 return false;
1895 1895
1896 } 1896 }
1897 #if MAX_NUMNODES > 1 1897 #if MAX_NUMNODES > 1
1898 1898
1899 /* 1899 /*
1900 * Always updating the nodemask is not very good - even if we have an empty 1900 * Always updating the nodemask is not very good - even if we have an empty
1901 * list or the wrong list here, we can start from some node and traverse all 1901 * list or the wrong list here, we can start from some node and traverse all
1902 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1902 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1903 * 1903 *
1904 */ 1904 */
1905 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1905 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1906 { 1906 {
1907 int nid; 1907 int nid;
1908 /* 1908 /*
1909 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1909 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1910 * pagein/pageout changes since the last update. 1910 * pagein/pageout changes since the last update.
1911 */ 1911 */
1912 if (!atomic_read(&memcg->numainfo_events)) 1912 if (!atomic_read(&memcg->numainfo_events))
1913 return; 1913 return;
1914 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1914 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1915 return; 1915 return;
1916 1916
1917 /* make a nodemask where this memcg uses memory from */ 1917 /* make a nodemask where this memcg uses memory from */
1918 memcg->scan_nodes = node_states[N_MEMORY]; 1918 memcg->scan_nodes = node_states[N_MEMORY];
1919 1919
1920 for_each_node_mask(nid, node_states[N_MEMORY]) { 1920 for_each_node_mask(nid, node_states[N_MEMORY]) {
1921 1921
1922 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1922 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1923 node_clear(nid, memcg->scan_nodes); 1923 node_clear(nid, memcg->scan_nodes);
1924 } 1924 }
1925 1925
1926 atomic_set(&memcg->numainfo_events, 0); 1926 atomic_set(&memcg->numainfo_events, 0);
1927 atomic_set(&memcg->numainfo_updating, 0); 1927 atomic_set(&memcg->numainfo_updating, 0);
1928 } 1928 }
1929 1929
1930 /* 1930 /*
1931 * Selecting a node where we start reclaim from. Because what we need is just 1931 * Selecting a node where we start reclaim from. Because what we need is just
1932 * reducing the usage counter, starting from anywhere is OK. Considering 1932 * reducing the usage counter, starting from anywhere is OK. Considering
1933 * memory reclaim from the current node, there are pros and cons. 1933 * memory reclaim from the current node, there are pros and cons.
1934 * 1934 *
1935 * Freeing memory from the current node means freeing memory from a node which 1935 * Freeing memory from the current node means freeing memory from a node which
1936 * we'll use or we've used. So, it may make the LRU bad. And if several threads 1936 * we'll use or we've used. So, it may make the LRU bad. And if several threads
1937 * hit limits, they will see contention on a node. But freeing from a remote 1937 * hit limits, they will see contention on a node. But freeing from a remote
1938 * node means higher costs for memory reclaim because of memory latency. 1938 * node means higher costs for memory reclaim because of memory latency.
1939 * 1939 *
1940 * For now, we use round-robin. A better algorithm is welcome. 1940 * For now, we use round-robin. A better algorithm is welcome.
1941 */ 1941 */
1942 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1942 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1943 { 1943 {
1944 int node; 1944 int node;
1945 1945
1946 mem_cgroup_may_update_nodemask(memcg); 1946 mem_cgroup_may_update_nodemask(memcg);
1947 node = memcg->last_scanned_node; 1947 node = memcg->last_scanned_node;
1948 1948
1949 node = next_node(node, memcg->scan_nodes); 1949 node = next_node(node, memcg->scan_nodes);
1950 if (node == MAX_NUMNODES) 1950 if (node == MAX_NUMNODES)
1951 node = first_node(memcg->scan_nodes); 1951 node = first_node(memcg->scan_nodes);
1952 /* 1952 /*
1953 * We call this when we hit the limit, not when pages are added to the LRU. 1953 * We call this when we hit the limit, not when pages are added to the LRU.
1954 * The LRUs may hold no pages because all pages are UNEVICTABLE, or the 1954 * The LRUs may hold no pages because all pages are UNEVICTABLE, or the
1955 * memcg is too small and no pages are on the LRU. In that case, 1955 * memcg is too small and no pages are on the LRU. In that case,
1956 * we use the current node. 1956 * we use the current node.
1957 */ 1957 */
1958 if (unlikely(node == MAX_NUMNODES)) 1958 if (unlikely(node == MAX_NUMNODES))
1959 node = numa_node_id(); 1959 node = numa_node_id();
1960 1960
1961 memcg->last_scanned_node = node; 1961 memcg->last_scanned_node = node;
1962 return node; 1962 return node;
1963 } 1963 }
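A quick hypothetical walk-through of the round-robin above: with scan_nodes = {0, 2} and last_scanned_node = 0, next_node() yields node 2; on the next call next_node(2, ...) returns MAX_NUMNODES, so first_node() wraps back to node 0. If scan_nodes happens to be empty (for example, every node failed the reclaimable test), both lookups return MAX_NUMNODES and the fallback picks numa_node_id(), the current node.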
1964 1964
1965 /* 1965 /*
1966 * Check whether any node contains reclaimable pages or not. 1966 * Check whether any node contains reclaimable pages or not.
1967 * For a quick scan, we make use of scan_nodes. This allows us to skip 1967 * For a quick scan, we make use of scan_nodes. This allows us to skip
1968 * unused nodes. But scan_nodes is lazily updated and may not contain 1968 * unused nodes. But scan_nodes is lazily updated and may not contain
1969 * enough new information. We need to double check. 1969 * enough new information. We need to double check.
1970 */ 1970 */
1971 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1971 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1972 { 1972 {
1973 int nid; 1973 int nid;
1974 1974
1975 /* 1975 /*
1976 * quick check... making use of scan_nodes. 1976 * quick check... making use of scan_nodes.
1977 * We can skip unused nodes. 1977 * We can skip unused nodes.
1978 */ 1978 */
1979 if (!nodes_empty(memcg->scan_nodes)) { 1979 if (!nodes_empty(memcg->scan_nodes)) {
1980 for (nid = first_node(memcg->scan_nodes); 1980 for (nid = first_node(memcg->scan_nodes);
1981 nid < MAX_NUMNODES; 1981 nid < MAX_NUMNODES;
1982 nid = next_node(nid, memcg->scan_nodes)) { 1982 nid = next_node(nid, memcg->scan_nodes)) {
1983 1983
1984 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1984 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1985 return true; 1985 return true;
1986 } 1986 }
1987 } 1987 }
1988 /* 1988 /*
1989 * Check rest of nodes. 1989 * Check rest of nodes.
1990 */ 1990 */
1991 for_each_node_state(nid, N_MEMORY) { 1991 for_each_node_state(nid, N_MEMORY) {
1992 if (node_isset(nid, memcg->scan_nodes)) 1992 if (node_isset(nid, memcg->scan_nodes))
1993 continue; 1993 continue;
1994 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1994 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1995 return true; 1995 return true;
1996 } 1996 }
1997 return false; 1997 return false;
1998 } 1998 }
1999 1999
2000 #else 2000 #else
2001 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2001 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2002 { 2002 {
2003 return 0; 2003 return 0;
2004 } 2004 }
2005 2005
2006 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2006 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2007 { 2007 {
2008 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2008 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2009 } 2009 }
2010 #endif 2010 #endif
2011 2011
2012 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2012 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2013 struct zone *zone, 2013 struct zone *zone,
2014 gfp_t gfp_mask, 2014 gfp_t gfp_mask,
2015 unsigned long *total_scanned) 2015 unsigned long *total_scanned)
2016 { 2016 {
2017 struct mem_cgroup *victim = NULL; 2017 struct mem_cgroup *victim = NULL;
2018 int total = 0; 2018 int total = 0;
2019 int loop = 0; 2019 int loop = 0;
2020 unsigned long excess; 2020 unsigned long excess;
2021 unsigned long nr_scanned; 2021 unsigned long nr_scanned;
2022 struct mem_cgroup_reclaim_cookie reclaim = { 2022 struct mem_cgroup_reclaim_cookie reclaim = {
2023 .zone = zone, 2023 .zone = zone,
2024 .priority = 0, 2024 .priority = 0,
2025 }; 2025 };
2026 2026
2027 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2027 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2028 2028
2029 while (1) { 2029 while (1) {
2030 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2030 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2031 if (!victim) { 2031 if (!victim) {
2032 loop++; 2032 loop++;
2033 if (loop >= 2) { 2033 if (loop >= 2) {
2034 /* 2034 /*
2035 * If we have not been able to reclaim 2035 * If we have not been able to reclaim
2036 * anything, it might be because there are 2036 * anything, it might be because there are
2037 * no reclaimable pages under this hierarchy 2037 * no reclaimable pages under this hierarchy
2038 */ 2038 */
2039 if (!total) 2039 if (!total)
2040 break; 2040 break;
2041 /* 2041 /*
2042 * We want to do more targeted reclaim. 2042 * We want to do more targeted reclaim.
2043 * excess >> 2 is not too excessive, so we do not 2043 * excess >> 2 is not too excessive, so we do not
2044 * reclaim too much, nor too little, so we do not keep 2044 * reclaim too much, nor too little, so we do not keep
2045 * coming back to reclaim from this cgroup 2045 * coming back to reclaim from this cgroup
2046 */ 2046 */
2047 if (total >= (excess >> 2) || 2047 if (total >= (excess >> 2) ||
2048 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2048 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2049 break; 2049 break;
2050 } 2050 }
2051 continue; 2051 continue;
2052 } 2052 }
2053 if (!mem_cgroup_reclaimable(victim, false)) 2053 if (!mem_cgroup_reclaimable(victim, false))
2054 continue; 2054 continue;
2055 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2055 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2056 zone, &nr_scanned); 2056 zone, &nr_scanned);
2057 *total_scanned += nr_scanned; 2057 *total_scanned += nr_scanned;
2058 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2058 if (!res_counter_soft_limit_excess(&root_memcg->res))
2059 break; 2059 break;
2060 } 2060 }
2061 mem_cgroup_iter_break(root_memcg, victim); 2061 mem_cgroup_iter_break(root_memcg, victim);
2062 return total; 2062 return total;
2063 } 2063 }
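As a worked hypothetical example of the cut-off above: if the cgroup is 40000 pages over its soft limit, excess is 40000 and reclaim continues until at least excess >> 2 = 10000 pages have been freed (checked after each full pass over the hierarchy, starting with the second), until res_counter_soft_limit_excess() drops to zero, or until the pass count exceeds MEM_CGROUP_MAX_RECLAIM_LOOPS; if two full passes reclaim nothing at all, the loop gives up early.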
2064 2064
2065 #ifdef CONFIG_LOCKDEP 2065 #ifdef CONFIG_LOCKDEP
2066 static struct lockdep_map memcg_oom_lock_dep_map = { 2066 static struct lockdep_map memcg_oom_lock_dep_map = {
2067 .name = "memcg_oom_lock", 2067 .name = "memcg_oom_lock",
2068 }; 2068 };
2069 #endif 2069 #endif
2070 2070
2071 static DEFINE_SPINLOCK(memcg_oom_lock); 2071 static DEFINE_SPINLOCK(memcg_oom_lock);
2072 2072
2073 /* 2073 /*
2074 * Check whether the OOM killer is already running under our hierarchy. 2074 * Check whether the OOM killer is already running under our hierarchy.
2075 * If someone is running it, return false. 2075 * If someone is running it, return false.
2076 */ 2076 */
2077 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2077 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2078 { 2078 {
2079 struct mem_cgroup *iter, *failed = NULL; 2079 struct mem_cgroup *iter, *failed = NULL;
2080 2080
2081 spin_lock(&memcg_oom_lock); 2081 spin_lock(&memcg_oom_lock);
2082 2082
2083 for_each_mem_cgroup_tree(iter, memcg) { 2083 for_each_mem_cgroup_tree(iter, memcg) {
2084 if (iter->oom_lock) { 2084 if (iter->oom_lock) {
2085 /* 2085 /*
2086 * this subtree of our hierarchy is already locked, 2086 * this subtree of our hierarchy is already locked,
2087 * so we cannot grant the lock. 2087 * so we cannot grant the lock.
2088 */ 2088 */
2089 failed = iter; 2089 failed = iter;
2090 mem_cgroup_iter_break(memcg, iter); 2090 mem_cgroup_iter_break(memcg, iter);
2091 break; 2091 break;
2092 } else 2092 } else
2093 iter->oom_lock = true; 2093 iter->oom_lock = true;
2094 } 2094 }
2095 2095
2096 if (failed) { 2096 if (failed) {
2097 /* 2097 /*
2098 * OK, we failed to lock the whole subtree so we have 2098 * OK, we failed to lock the whole subtree so we have
2099 * to clean up what we set up, up to the failing subtree 2099 * to clean up what we set up, up to the failing subtree
2100 */ 2100 */
2101 for_each_mem_cgroup_tree(iter, memcg) { 2101 for_each_mem_cgroup_tree(iter, memcg) {
2102 if (iter == failed) { 2102 if (iter == failed) {
2103 mem_cgroup_iter_break(memcg, iter); 2103 mem_cgroup_iter_break(memcg, iter);
2104 break; 2104 break;
2105 } 2105 }
2106 iter->oom_lock = false; 2106 iter->oom_lock = false;
2107 } 2107 }
2108 } else 2108 } else
2109 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2109 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2110 2110
2111 spin_unlock(&memcg_oom_lock); 2111 spin_unlock(&memcg_oom_lock);
2112 2112
2113 return !failed; 2113 return !failed;
2114 } 2114 }
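To illustrate the rollback above with a hypothetical hierarchy A -> B -> C where C already holds oom_lock: the first walk sets oom_lock on A and B, hits C, records it as the failed memcg and breaks out; the cleanup walk then clears oom_lock on A and B, stops when it reaches C again, and the function returns false without touching C's existing lock.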
2115 2115
2116 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2116 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2117 { 2117 {
2118 struct mem_cgroup *iter; 2118 struct mem_cgroup *iter;
2119 2119
2120 spin_lock(&memcg_oom_lock); 2120 spin_lock(&memcg_oom_lock);
2121 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2121 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2122 for_each_mem_cgroup_tree(iter, memcg) 2122 for_each_mem_cgroup_tree(iter, memcg)
2123 iter->oom_lock = false; 2123 iter->oom_lock = false;
2124 spin_unlock(&memcg_oom_lock); 2124 spin_unlock(&memcg_oom_lock);
2125 } 2125 }
2126 2126
2127 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2127 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2128 { 2128 {
2129 struct mem_cgroup *iter; 2129 struct mem_cgroup *iter;
2130 2130
2131 for_each_mem_cgroup_tree(iter, memcg) 2131 for_each_mem_cgroup_tree(iter, memcg)
2132 atomic_inc(&iter->under_oom); 2132 atomic_inc(&iter->under_oom);
2133 } 2133 }
2134 2134
2135 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2135 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2136 { 2136 {
2137 struct mem_cgroup *iter; 2137 struct mem_cgroup *iter;
2138 2138
2139 /* 2139 /*
2140 * When a new child is created while the hierarchy is under oom, 2140 * When a new child is created while the hierarchy is under oom,
2141 * mem_cgroup_oom_lock() may not be called. We have to use 2141 * mem_cgroup_oom_lock() may not be called. We have to use
2142 * atomic_add_unless() here. 2142 * atomic_add_unless() here.
2143 */ 2143 */
2144 for_each_mem_cgroup_tree(iter, memcg) 2144 for_each_mem_cgroup_tree(iter, memcg)
2145 atomic_add_unless(&iter->under_oom, -1, 0); 2145 atomic_add_unless(&iter->under_oom, -1, 0);
2146 } 2146 }
2147 2147
2148 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2148 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2149 2149
2150 struct oom_wait_info { 2150 struct oom_wait_info {
2151 struct mem_cgroup *memcg; 2151 struct mem_cgroup *memcg;
2152 wait_queue_t wait; 2152 wait_queue_t wait;
2153 }; 2153 };
2154 2154
2155 static int memcg_oom_wake_function(wait_queue_t *wait, 2155 static int memcg_oom_wake_function(wait_queue_t *wait,
2156 unsigned mode, int sync, void *arg) 2156 unsigned mode, int sync, void *arg)
2157 { 2157 {
2158 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2158 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2159 struct mem_cgroup *oom_wait_memcg; 2159 struct mem_cgroup *oom_wait_memcg;
2160 struct oom_wait_info *oom_wait_info; 2160 struct oom_wait_info *oom_wait_info;
2161 2161
2162 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2162 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2163 oom_wait_memcg = oom_wait_info->memcg; 2163 oom_wait_memcg = oom_wait_info->memcg;
2164 2164
2165 /* 2165 /*
2166 * Both oom_wait_info->memcg and wake_memcg are stable under us, 2166 * Both oom_wait_info->memcg and wake_memcg are stable under us,
2167 * so we can use css_is_ancestor() without worrying about RCU. 2167 * so we can use css_is_ancestor() without worrying about RCU.
2168 */ 2168 */
2169 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2169 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2170 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2170 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2171 return 0; 2171 return 0;
2172 return autoremove_wake_function(wait, mode, sync, arg); 2172 return autoremove_wake_function(wait, mode, sync, arg);
2173 } 2173 }
2174 2174
2175 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2175 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2176 { 2176 {
2177 atomic_inc(&memcg->oom_wakeups); 2177 atomic_inc(&memcg->oom_wakeups);
2178 /* for filtering, pass "memcg" as argument. */ 2178 /* for filtering, pass "memcg" as argument. */
2179 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2179 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2180 } 2180 }
2181 2181
2182 static void memcg_oom_recover(struct mem_cgroup *memcg) 2182 static void memcg_oom_recover(struct mem_cgroup *memcg)
2183 { 2183 {
2184 if (memcg && atomic_read(&memcg->under_oom)) 2184 if (memcg && atomic_read(&memcg->under_oom))
2185 memcg_wakeup_oom(memcg); 2185 memcg_wakeup_oom(memcg);
2186 } 2186 }
2187 2187
2188 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2188 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2189 { 2189 {
2190 if (!current->memcg_oom.may_oom) 2190 if (!current->memcg_oom.may_oom)
2191 return; 2191 return;
2192 /* 2192 /*
2193 * We are in the middle of the charge context here, so we 2193 * We are in the middle of the charge context here, so we
2194 * don't want to block when potentially sitting on a callstack 2194 * don't want to block when potentially sitting on a callstack
2195 * that holds all kinds of filesystem and mm locks. 2195 * that holds all kinds of filesystem and mm locks.
2196 * 2196 *
2197 * Also, the caller may handle a failed allocation gracefully 2197 * Also, the caller may handle a failed allocation gracefully
2198 * (like optional page cache readahead) and so an OOM killer 2198 * (like optional page cache readahead) and so an OOM killer
2199 * invocation might not even be necessary. 2199 * invocation might not even be necessary.
2200 * 2200 *
2201 * That's why we don't do anything here except remember the 2201 * That's why we don't do anything here except remember the
2202 * OOM context and then deal with it at the end of the page 2202 * OOM context and then deal with it at the end of the page
2203 * fault when the stack is unwound, the locks are released, 2203 * fault when the stack is unwound, the locks are released,
2204 * and when we know whether the fault was overall successful. 2204 * and when we know whether the fault was overall successful.
2205 */ 2205 */
2206 css_get(&memcg->css); 2206 css_get(&memcg->css);
2207 current->memcg_oom.memcg = memcg; 2207 current->memcg_oom.memcg = memcg;
2208 current->memcg_oom.gfp_mask = mask; 2208 current->memcg_oom.gfp_mask = mask;
2209 current->memcg_oom.order = order; 2209 current->memcg_oom.order = order;
2210 } 2210 }
2211 2211
2212 /** 2212 /**
2213 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2213 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2214 * @handle: actually kill/wait or just clean up the OOM state 2214 * @handle: actually kill/wait or just clean up the OOM state
2215 * 2215 *
2216 * This has to be called at the end of a page fault if the memcg OOM 2216 * This has to be called at the end of a page fault if the memcg OOM
2217 * handler was enabled. 2217 * handler was enabled.
2218 * 2218 *
2219 * Memcg supports userspace OOM handling where failed allocations must 2219 * Memcg supports userspace OOM handling where failed allocations must
2220 * sleep on a waitqueue until the userspace task resolves the 2220 * sleep on a waitqueue until the userspace task resolves the
2221 * situation. Sleeping directly in the charge context with all kinds 2221 * situation. Sleeping directly in the charge context with all kinds
2222 * of locks held is not a good idea, instead we remember an OOM state 2222 * of locks held is not a good idea, instead we remember an OOM state
2223 * in the task and mem_cgroup_oom_synchronize() has to be called at 2223 * in the task and mem_cgroup_oom_synchronize() has to be called at
2224 * the end of the page fault to complete the OOM handling. 2224 * the end of the page fault to complete the OOM handling.
2225 * 2225 *
2226 * Returns %true if an ongoing memcg OOM situation was detected and 2226 * Returns %true if an ongoing memcg OOM situation was detected and
2227 * completed, %false otherwise. 2227 * completed, %false otherwise.
2228 */ 2228 */
2229 bool mem_cgroup_oom_synchronize(bool handle) 2229 bool mem_cgroup_oom_synchronize(bool handle)
2230 { 2230 {
2231 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2231 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2232 struct oom_wait_info owait; 2232 struct oom_wait_info owait;
2233 bool locked; 2233 bool locked;
2234 2234
2235 /* OOM is global, do not handle */ 2235 /* OOM is global, do not handle */
2236 if (!memcg) 2236 if (!memcg)
2237 return false; 2237 return false;
2238 2238
2239 if (!handle) 2239 if (!handle)
2240 goto cleanup; 2240 goto cleanup;
2241 2241
2242 owait.memcg = memcg; 2242 owait.memcg = memcg;
2243 owait.wait.flags = 0; 2243 owait.wait.flags = 0;
2244 owait.wait.func = memcg_oom_wake_function; 2244 owait.wait.func = memcg_oom_wake_function;
2245 owait.wait.private = current; 2245 owait.wait.private = current;
2246 INIT_LIST_HEAD(&owait.wait.task_list); 2246 INIT_LIST_HEAD(&owait.wait.task_list);
2247 2247
2248 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2248 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2249 mem_cgroup_mark_under_oom(memcg); 2249 mem_cgroup_mark_under_oom(memcg);
2250 2250
2251 locked = mem_cgroup_oom_trylock(memcg); 2251 locked = mem_cgroup_oom_trylock(memcg);
2252 2252
2253 if (locked) 2253 if (locked)
2254 mem_cgroup_oom_notify(memcg); 2254 mem_cgroup_oom_notify(memcg);
2255 2255
2256 if (locked && !memcg->oom_kill_disable) { 2256 if (locked && !memcg->oom_kill_disable) {
2257 mem_cgroup_unmark_under_oom(memcg); 2257 mem_cgroup_unmark_under_oom(memcg);
2258 finish_wait(&memcg_oom_waitq, &owait.wait); 2258 finish_wait(&memcg_oom_waitq, &owait.wait);
2259 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2259 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2260 current->memcg_oom.order); 2260 current->memcg_oom.order);
2261 } else { 2261 } else {
2262 schedule(); 2262 schedule();
2263 mem_cgroup_unmark_under_oom(memcg); 2263 mem_cgroup_unmark_under_oom(memcg);
2264 finish_wait(&memcg_oom_waitq, &owait.wait); 2264 finish_wait(&memcg_oom_waitq, &owait.wait);
2265 } 2265 }
2266 2266
2267 if (locked) { 2267 if (locked) {
2268 mem_cgroup_oom_unlock(memcg); 2268 mem_cgroup_oom_unlock(memcg);
2269 /* 2269 /*
2270 * There is no guarantee that an OOM-lock contender 2270 * There is no guarantee that an OOM-lock contender
2271 * sees the wakeups triggered by the OOM kill 2271 * sees the wakeups triggered by the OOM kill
2272 * uncharges. Wake any sleepers explicitly. 2272 * uncharges. Wake any sleepers explicitly.
2273 */ 2273 */
2274 memcg_oom_recover(memcg); 2274 memcg_oom_recover(memcg);
2275 } 2275 }
2276 cleanup: 2276 cleanup:
2277 current->memcg_oom.memcg = NULL; 2277 current->memcg_oom.memcg = NULL;
2278 css_put(&memcg->css); 2278 css_put(&memcg->css);
2279 return true; 2279 return true;
2280 } 2280 }
2281 2281
2282 /* 2282 /*
2283 * Currently used to update mapped file statistics, but the routine can be 2283 * Currently used to update mapped file statistics, but the routine can be
2284 * generalized to update other statistics as well. 2284 * generalized to update other statistics as well.
2285 * 2285 *
2286 * Notes: Race condition 2286 * Notes: Race condition
2287 * 2287 *
2288 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2288 * We usually use page_cgroup_lock() for accessing page_cgroup member but
2289 * it tends to be costly. But considering some conditions, we don't need 2289 * it tends to be costly. But considering some conditions, we don't need
2290 * to do so _always_. 2290 * to do so _always_.
2291 * 2291 *
2292 * Considering "charge", lock_page_cgroup() is not required because all 2292 * Considering "charge", lock_page_cgroup() is not required because all
2293 * file-stat operations happen after a page is attached to radix-tree. There 2293 * file-stat operations happen after a page is attached to radix-tree. There
2294 * is no race with "charge". 2294 * is no race with "charge".
2295 * 2295 *
2296 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2296 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2297 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 2297 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
2298 * if there is a race with "uncharge". The statistics themselves are properly 2298 * if there is a race with "uncharge". The statistics themselves are properly
2299 * handled by flags. 2299 * handled by flags.
2300 * 2300 *
2301 * Considering "move", this is an only case we see a race. To make the race 2301 * Considering "move", this is an only case we see a race. To make the race
2302 * small, we check mm->moving_account and detect there are possibility of race 2302 * small, we check mm->moving_account and detect there are possibility of race
2303 * If there is, we take a lock. 2303 * If there is, we take a lock.
2304 */ 2304 */
2305 2305
2306 void __mem_cgroup_begin_update_page_stat(struct page *page, 2306 void __mem_cgroup_begin_update_page_stat(struct page *page,
2307 bool *locked, unsigned long *flags) 2307 bool *locked, unsigned long *flags)
2308 { 2308 {
2309 struct mem_cgroup *memcg; 2309 struct mem_cgroup *memcg;
2310 struct page_cgroup *pc; 2310 struct page_cgroup *pc;
2311 2311
2312 pc = lookup_page_cgroup(page); 2312 pc = lookup_page_cgroup(page);
2313 again: 2313 again:
2314 memcg = pc->mem_cgroup; 2314 memcg = pc->mem_cgroup;
2315 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2315 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2316 return; 2316 return;
2317 /* 2317 /*
2318 * If this memory cgroup is not under account moving, we don't 2318 * If this memory cgroup is not under account moving, we don't
2319 * need to take move_lock_mem_cgroup(). Because we already hold 2319 * need to take move_lock_mem_cgroup(). Because we already hold
2320 * rcu_read_lock(), any calls to move_account will be delayed until 2320 * rcu_read_lock(), any calls to move_account will be delayed until
2321 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2321 * rcu_read_unlock() if mem_cgroup_stolen() == true.
2322 */ 2322 */
2323 if (!mem_cgroup_stolen(memcg)) 2323 if (!mem_cgroup_stolen(memcg))
2324 return; 2324 return;
2325 2325
2326 move_lock_mem_cgroup(memcg, flags); 2326 move_lock_mem_cgroup(memcg, flags);
2327 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2327 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2328 move_unlock_mem_cgroup(memcg, flags); 2328 move_unlock_mem_cgroup(memcg, flags);
2329 goto again; 2329 goto again;
2330 } 2330 }
2331 *locked = true; 2331 *locked = true;
2332 } 2332 }
2333 2333
2334 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2334 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2335 { 2335 {
2336 struct page_cgroup *pc = lookup_page_cgroup(page); 2336 struct page_cgroup *pc = lookup_page_cgroup(page);
2337 2337
2338 /* 2338 /*
2339 * It's guaranteed that pc->mem_cgroup never changes while 2339 * It's guaranteed that pc->mem_cgroup never changes while
2340 * the lock is held, because any routine that modifies pc->mem_cgroup 2340 * the lock is held, because any routine that modifies pc->mem_cgroup
2341 * should take move_lock_mem_cgroup(). 2341 * should take move_lock_mem_cgroup().
2342 */ 2342 */
2343 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2343 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2344 } 2344 }
2345 2345
2346 void mem_cgroup_update_page_stat(struct page *page, 2346 void mem_cgroup_update_page_stat(struct page *page,
2347 enum mem_cgroup_stat_index idx, int val) 2347 enum mem_cgroup_stat_index idx, int val)
2348 { 2348 {
2349 struct mem_cgroup *memcg; 2349 struct mem_cgroup *memcg;
2350 struct page_cgroup *pc = lookup_page_cgroup(page); 2350 struct page_cgroup *pc = lookup_page_cgroup(page);
2351 unsigned long uninitialized_var(flags); 2351 unsigned long uninitialized_var(flags);
2352 2352
2353 if (mem_cgroup_disabled()) 2353 if (mem_cgroup_disabled())
2354 return; 2354 return;
2355 2355
2356 VM_BUG_ON(!rcu_read_lock_held()); 2356 VM_BUG_ON(!rcu_read_lock_held());
2357 memcg = pc->mem_cgroup; 2357 memcg = pc->mem_cgroup;
2358 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2358 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2359 return; 2359 return;
2360 2360
2361 this_cpu_add(memcg->stat->count[idx], val); 2361 this_cpu_add(memcg->stat->count[idx], val);
2362 } 2362 }
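
The three helpers above implement a two-phase page-stat update: __mem_cgroup_begin_update_page_stat() takes the heavyweight move_lock only when an account move might be in flight, the statistic update itself is a plain per-cpu add, and __mem_cgroup_end_update_page_stat() drops the lock if it was taken. The following is a minimal userspace sketch of that protocol, not kernel code; the structure and every name in it are invented purely for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct group_stats {
        atomic_int moving_account;      /* stand-in for mem_cgroup_stolen() */
        pthread_mutex_t move_lock;      /* stand-in for move_lock_mem_cgroup() */
        atomic_long mapped_file;        /* one per-stat counter */
};

static void begin_update(struct group_stats *g, int *locked)
{
        *locked = 0;
        if (!atomic_load(&g->moving_account))
                return;                 /* fast path: nobody is moving, no lock */
        pthread_mutex_lock(&g->move_lock);
        *locked = 1;
}

static void update_stat(struct group_stats *g, long val)
{
        atomic_fetch_add(&g->mapped_file, val);
}

static void end_update(struct group_stats *g, int locked)
{
        if (locked)
                pthread_mutex_unlock(&g->move_lock);
}

int main(void)
{
        struct group_stats g = { .move_lock = PTHREAD_MUTEX_INITIALIZER };
        int locked;

        begin_update(&g, &locked);
        update_stat(&g, 1);
        end_update(&g, locked);
        printf("mapped_file = %ld\n", atomic_load(&g.mapped_file));
        return 0;
}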
2363 2363
2364 /* 2364 /*
2365 * size of first charge trial. "32" comes from vmscan.c's magic value. 2365 * size of first charge trial. "32" comes from vmscan.c's magic value.
2366 * TODO: bigger values may be necessary on big-iron machines. 2366 * TODO: bigger values may be necessary on big-iron machines.
2367 */ 2367 */
2368 #define CHARGE_BATCH 32U 2368 #define CHARGE_BATCH 32U
2369 struct memcg_stock_pcp { 2369 struct memcg_stock_pcp {
2370 struct mem_cgroup *cached; /* this is never the root cgroup */ 2370 struct mem_cgroup *cached; /* this is never the root cgroup */
2371 unsigned int nr_pages; 2371 unsigned int nr_pages;
2372 struct work_struct work; 2372 struct work_struct work;
2373 unsigned long flags; 2373 unsigned long flags;
2374 #define FLUSHING_CACHED_CHARGE 0 2374 #define FLUSHING_CACHED_CHARGE 0
2375 }; 2375 };
2376 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2376 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2377 static DEFINE_MUTEX(percpu_charge_mutex); 2377 static DEFINE_MUTEX(percpu_charge_mutex);
2378 2378
2379 /** 2379 /**
2380 * consume_stock: Try to consume stocked charge on this cpu. 2380 * consume_stock: Try to consume stocked charge on this cpu.
2381 * @memcg: memcg to consume from. 2381 * @memcg: memcg to consume from.
2382 * @nr_pages: how many pages to charge. 2382 * @nr_pages: how many pages to charge.
2383 * 2383 *
2384 * The charges will only happen if @memcg matches the current cpu's memcg 2384 * The charges will only happen if @memcg matches the current cpu's memcg
2385 * stock, and at least @nr_pages are available in that stock. Failure to 2385 * stock, and at least @nr_pages are available in that stock. Failure to
2386 * service an allocation will refill the stock. 2386 * service an allocation will refill the stock.
2387 * 2387 *
2388 * returns true if successful, false otherwise. 2388 * returns true if successful, false otherwise.
2389 */ 2389 */
2390 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2390 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2391 { 2391 {
2392 struct memcg_stock_pcp *stock; 2392 struct memcg_stock_pcp *stock;
2393 bool ret = true; 2393 bool ret = true;
2394 2394
2395 if (nr_pages > CHARGE_BATCH) 2395 if (nr_pages > CHARGE_BATCH)
2396 return false; 2396 return false;
2397 2397
2398 stock = &get_cpu_var(memcg_stock); 2398 stock = &get_cpu_var(memcg_stock);
2399 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2399 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2400 stock->nr_pages -= nr_pages; 2400 stock->nr_pages -= nr_pages;
2401 else /* need to call res_counter_charge */ 2401 else /* need to call res_counter_charge */
2402 ret = false; 2402 ret = false;
2403 put_cpu_var(memcg_stock); 2403 put_cpu_var(memcg_stock);
2404 return ret; 2404 return ret;
2405 } 2405 }
2406 2406
2407 /* 2407 /*
2408 * Returns stocks cached in percpu to the res_counter and resets the cached information. 2408 * Returns stocks cached in percpu to the res_counter and resets the cached information.
2409 */ 2409 */
2410 static void drain_stock(struct memcg_stock_pcp *stock) 2410 static void drain_stock(struct memcg_stock_pcp *stock)
2411 { 2411 {
2412 struct mem_cgroup *old = stock->cached; 2412 struct mem_cgroup *old = stock->cached;
2413 2413
2414 if (stock->nr_pages) { 2414 if (stock->nr_pages) {
2415 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2415 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2416 2416
2417 res_counter_uncharge(&old->res, bytes); 2417 res_counter_uncharge(&old->res, bytes);
2418 if (do_swap_account) 2418 if (do_swap_account)
2419 res_counter_uncharge(&old->memsw, bytes); 2419 res_counter_uncharge(&old->memsw, bytes);
2420 stock->nr_pages = 0; 2420 stock->nr_pages = 0;
2421 } 2421 }
2422 stock->cached = NULL; 2422 stock->cached = NULL;
2423 } 2423 }
2424 2424
2425 /* 2425 /*
2426 * This must be called under preempt disabled or must be called by 2426 * This must be called under preempt disabled or must be called by
2427 * a thread which is pinned to local cpu. 2427 * a thread which is pinned to local cpu.
2428 */ 2428 */
2429 static void drain_local_stock(struct work_struct *dummy) 2429 static void drain_local_stock(struct work_struct *dummy)
2430 { 2430 {
2431 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2431 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2432 drain_stock(stock); 2432 drain_stock(stock);
2433 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2433 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2434 } 2434 }
2435 2435
2436 static void __init memcg_stock_init(void) 2436 static void __init memcg_stock_init(void)
2437 { 2437 {
2438 int cpu; 2438 int cpu;
2439 2439
2440 for_each_possible_cpu(cpu) { 2440 for_each_possible_cpu(cpu) {
2441 struct memcg_stock_pcp *stock = 2441 struct memcg_stock_pcp *stock =
2442 &per_cpu(memcg_stock, cpu); 2442 &per_cpu(memcg_stock, cpu);
2443 INIT_WORK(&stock->work, drain_local_stock); 2443 INIT_WORK(&stock->work, drain_local_stock);
2444 } 2444 }
2445 } 2445 }
2446 2446
2447 /* 2447 /*
2448 * Cache charges obtained from the res_counter in the local per-cpu area. 2448 * Cache charges obtained from the res_counter in the local per-cpu area.
2449 * They will be consumed by consume_stock() later. 2449 * They will be consumed by consume_stock() later.
2450 */ 2450 */
2451 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2451 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2452 { 2452 {
2453 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2453 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2454 2454
2455 if (stock->cached != memcg) { /* reset if necessary */ 2455 if (stock->cached != memcg) { /* reset if necessary */
2456 drain_stock(stock); 2456 drain_stock(stock);
2457 stock->cached = memcg; 2457 stock->cached = memcg;
2458 } 2458 }
2459 stock->nr_pages += nr_pages; 2459 stock->nr_pages += nr_pages;
2460 put_cpu_var(memcg_stock); 2460 put_cpu_var(memcg_stock);
2461 } 2461 }
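
consume_stock(), drain_stock() and refill_stock() together implement a small per-cpu cache of pre-charged pages sitting in front of the res_counter. The sketch below is a single-threaded userspace model of that cache, assuming one stock and a plain integer standing in for the res_counter; only CHARGE_BATCH and the function names come from the code above, everything else is invented.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

struct stock {
        int cached_group;       /* which group the stock belongs to (-1: none) */
        unsigned int nr_pages;  /* pre-charged pages available locally */
};

static unsigned long counter;   /* global "res_counter" usage, in pages */

static bool consume_stock(struct stock *s, int group, unsigned int nr_pages)
{
        if (nr_pages > CHARGE_BATCH)
                return false;
        if (s->cached_group == group && s->nr_pages >= nr_pages) {
                s->nr_pages -= nr_pages;
                return true;            /* served without touching the counter */
        }
        return false;
}

static void drain_stock(struct stock *s)
{
        counter -= s->nr_pages;         /* give the unused pre-charge back */
        s->nr_pages = 0;
        s->cached_group = -1;
}

static void refill_stock(struct stock *s, int group, unsigned int nr_pages)
{
        if (s->cached_group != group)
                drain_stock(s);         /* the stock only caches one group */
        s->cached_group = group;
        s->nr_pages += nr_pages;
}

int main(void)
{
        struct stock s = { .cached_group = -1 };

        counter += CHARGE_BATCH;                /* charge a whole batch up front... */
        refill_stock(&s, 1, CHARGE_BATCH - 1);  /* ...and park the surplus locally */
        printf("consumed: %d, left in stock: %u, counter: %lu\n",
               consume_stock(&s, 1, 4), s.nr_pages, counter);
        return 0;
}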
2462 2462
2463 /* 2463 /*
2464 * Drains all per-CPU charge caches for the given root_memcg and the subtree 2464 * Drains all per-CPU charge caches for the given root_memcg and the subtree
2465 * of the hierarchy under it. The sync flag says whether we should block 2465 * of the hierarchy under it. The sync flag says whether we should block
2466 * until the work is done. 2466 * until the work is done.
2467 */ 2467 */
2468 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2468 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2469 { 2469 {
2470 int cpu, curcpu; 2470 int cpu, curcpu;
2471 2471
2472 /* Notify other cpus that system-wide "drain" is running */ 2472 /* Notify other cpus that system-wide "drain" is running */
2473 get_online_cpus(); 2473 get_online_cpus();
2474 curcpu = get_cpu(); 2474 curcpu = get_cpu();
2475 for_each_online_cpu(cpu) { 2475 for_each_online_cpu(cpu) {
2476 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2476 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2477 struct mem_cgroup *memcg; 2477 struct mem_cgroup *memcg;
2478 2478
2479 memcg = stock->cached; 2479 memcg = stock->cached;
2480 if (!memcg || !stock->nr_pages) 2480 if (!memcg || !stock->nr_pages)
2481 continue; 2481 continue;
2482 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2482 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2483 continue; 2483 continue;
2484 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2484 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2485 if (cpu == curcpu) 2485 if (cpu == curcpu)
2486 drain_local_stock(&stock->work); 2486 drain_local_stock(&stock->work);
2487 else 2487 else
2488 schedule_work_on(cpu, &stock->work); 2488 schedule_work_on(cpu, &stock->work);
2489 } 2489 }
2490 } 2490 }
2491 put_cpu(); 2491 put_cpu();
2492 2492
2493 if (!sync) 2493 if (!sync)
2494 goto out; 2494 goto out;
2495 2495
2496 for_each_online_cpu(cpu) { 2496 for_each_online_cpu(cpu) {
2497 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2497 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2498 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2498 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2499 flush_work(&stock->work); 2499 flush_work(&stock->work);
2500 } 2500 }
2501 out: 2501 out:
2502 put_online_cpus(); 2502 put_online_cpus();
2503 } 2503 }
2504 2504
2505 /* 2505 /*
2506 * Tries to drain stocked charges in other cpus. This function is asynchronous 2506 * Tries to drain stocked charges in other cpus. This function is asynchronous
2507 * and just puts a work item on each cpu for draining locally. The caller can 2507 * and just puts a work item on each cpu for draining locally. The caller can
2508 * expect some charges to go back to the res_counter later but cannot wait for 2508 * expect some charges to go back to the res_counter later but cannot wait for
2509 * it. 2509 * it.
2510 */ 2510 */
2511 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2511 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2512 { 2512 {
2513 /* 2513 /*
2514 * If someone is already draining, avoid adding more kworker runs. 2514 * If someone is already draining, avoid adding more kworker runs.
2515 */ 2515 */
2516 if (!mutex_trylock(&percpu_charge_mutex)) 2516 if (!mutex_trylock(&percpu_charge_mutex))
2517 return; 2517 return;
2518 drain_all_stock(root_memcg, false); 2518 drain_all_stock(root_memcg, false);
2519 mutex_unlock(&percpu_charge_mutex); 2519 mutex_unlock(&percpu_charge_mutex);
2520 } 2520 }
2521 2521
2522 /* This is a synchronous drain interface. */ 2522 /* This is a synchronous drain interface. */
2523 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2523 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2524 { 2524 {
2525 /* called when force_empty is called */ 2525 /* called when force_empty is called */
2526 mutex_lock(&percpu_charge_mutex); 2526 mutex_lock(&percpu_charge_mutex);
2527 drain_all_stock(root_memcg, true); 2527 drain_all_stock(root_memcg, true);
2528 mutex_unlock(&percpu_charge_mutex); 2528 mutex_unlock(&percpu_charge_mutex);
2529 } 2529 }
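
The two entry points differ only in how they take percpu_charge_mutex: the async variant gives up immediately if another drainer already holds it, while the sync variant waits for its turn and then relies on drain_all_stock(..., true) to flush the queued work. A rough userspace analogue using a pthread mutex follows; it is illustrative only and not the kernel's locking primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

static void do_drain(int sync)
{
        /* stand-in for drain_all_stock(root_memcg, sync) */
        printf("draining (%s)\n", sync ? "sync" : "async");
}

static void drain_async(void)
{
        if (pthread_mutex_trylock(&drain_mutex) != 0)
                return;                 /* someone is already draining: skip */
        do_drain(0);
        pthread_mutex_unlock(&drain_mutex);
}

static void drain_sync(void)
{
        pthread_mutex_lock(&drain_mutex);       /* always wait our turn */
        do_drain(1);
        pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
        drain_async();
        drain_sync();
        return 0;
}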
2530 2530
2531 /* 2531 /*
2532 * This function drains the percpu counter values from a DEAD cpu and 2532 * This function drains the percpu counter values from a DEAD cpu and
2533 * moves them to the local cpu. Note that this function can be preempted. 2533 * moves them to the local cpu. Note that this function can be preempted.
2534 */ 2534 */
2535 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2535 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2536 { 2536 {
2537 int i; 2537 int i;
2538 2538
2539 spin_lock(&memcg->pcp_counter_lock); 2539 spin_lock(&memcg->pcp_counter_lock);
2540 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2540 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2541 long x = per_cpu(memcg->stat->count[i], cpu); 2541 long x = per_cpu(memcg->stat->count[i], cpu);
2542 2542
2543 per_cpu(memcg->stat->count[i], cpu) = 0; 2543 per_cpu(memcg->stat->count[i], cpu) = 0;
2544 memcg->nocpu_base.count[i] += x; 2544 memcg->nocpu_base.count[i] += x;
2545 } 2545 }
2546 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2546 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2547 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2547 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2548 2548
2549 per_cpu(memcg->stat->events[i], cpu) = 0; 2549 per_cpu(memcg->stat->events[i], cpu) = 0;
2550 memcg->nocpu_base.events[i] += x; 2550 memcg->nocpu_base.events[i] += x;
2551 } 2551 }
2552 spin_unlock(&memcg->pcp_counter_lock); 2552 spin_unlock(&memcg->pcp_counter_lock);
2553 } 2553 }
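
mem_cgroup_drain_pcp_counter() folds a dead CPU's per-cpu counters into the memcg's nocpu_base accumulators so the totals survive the CPU going away. Below is a tiny userspace model of that folding; the array sizes and names are arbitrary choices for the sketch.

#include <stdio.h>

#define NR_CPUS 4
#define NSTATS  3

static long percpu_count[NR_CPUS][NSTATS];
static long nocpu_base[NSTATS];

static void drain_dead_cpu(int cpu)
{
        for (int i = 0; i < NSTATS; i++) {
                nocpu_base[i] += percpu_count[cpu][i];  /* keep the totals */
                percpu_count[cpu][i] = 0;               /* the cpu slot is gone */
        }
}

static long read_stat(int idx)
{
        long sum = nocpu_base[idx];

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += percpu_count[cpu][idx];
        return sum;
}

int main(void)
{
        percpu_count[2][0] = 10;
        drain_dead_cpu(2);                              /* "cpu 2" went offline */
        printf("stat[0] = %ld\n", read_stat(0));        /* still 10 */
        return 0;
}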
2554 2554
2555 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2555 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2556 unsigned long action, 2556 unsigned long action,
2557 void *hcpu) 2557 void *hcpu)
2558 { 2558 {
2559 int cpu = (unsigned long)hcpu; 2559 int cpu = (unsigned long)hcpu;
2560 struct memcg_stock_pcp *stock; 2560 struct memcg_stock_pcp *stock;
2561 struct mem_cgroup *iter; 2561 struct mem_cgroup *iter;
2562 2562
2563 if (action == CPU_ONLINE) 2563 if (action == CPU_ONLINE)
2564 return NOTIFY_OK; 2564 return NOTIFY_OK;
2565 2565
2566 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2566 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2567 return NOTIFY_OK; 2567 return NOTIFY_OK;
2568 2568
2569 for_each_mem_cgroup(iter) 2569 for_each_mem_cgroup(iter)
2570 mem_cgroup_drain_pcp_counter(iter, cpu); 2570 mem_cgroup_drain_pcp_counter(iter, cpu);
2571 2571
2572 stock = &per_cpu(memcg_stock, cpu); 2572 stock = &per_cpu(memcg_stock, cpu);
2573 drain_stock(stock); 2573 drain_stock(stock);
2574 return NOTIFY_OK; 2574 return NOTIFY_OK;
2575 } 2575 }
2576 2576
2577 2577
2578 /* See mem_cgroup_try_charge() for details */ 2578 /* See mem_cgroup_try_charge() for details */
2579 enum { 2579 enum {
2580 CHARGE_OK, /* success */ 2580 CHARGE_OK, /* success */
2581 CHARGE_RETRY, /* need to retry but retry is not bad */ 2581 CHARGE_RETRY, /* need to retry but retry is not bad */
2582 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2582 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2583 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */ 2583 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */
2584 }; 2584 };
2585 2585
2586 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2586 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2587 unsigned int nr_pages, unsigned int min_pages, 2587 unsigned int nr_pages, unsigned int min_pages,
2588 bool invoke_oom) 2588 bool invoke_oom)
2589 { 2589 {
2590 unsigned long csize = nr_pages * PAGE_SIZE; 2590 unsigned long csize = nr_pages * PAGE_SIZE;
2591 struct mem_cgroup *mem_over_limit; 2591 struct mem_cgroup *mem_over_limit;
2592 struct res_counter *fail_res; 2592 struct res_counter *fail_res;
2593 unsigned long flags = 0; 2593 unsigned long flags = 0;
2594 int ret; 2594 int ret;
2595 2595
2596 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2596 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2597 2597
2598 if (likely(!ret)) { 2598 if (likely(!ret)) {
2599 if (!do_swap_account) 2599 if (!do_swap_account)
2600 return CHARGE_OK; 2600 return CHARGE_OK;
2601 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2601 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2602 if (likely(!ret)) 2602 if (likely(!ret))
2603 return CHARGE_OK; 2603 return CHARGE_OK;
2604 2604
2605 res_counter_uncharge(&memcg->res, csize); 2605 res_counter_uncharge(&memcg->res, csize);
2606 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2606 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2607 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2607 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2608 } else 2608 } else
2609 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2609 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2610 /* 2610 /*
2611 * Never reclaim on behalf of optional batching, retry with a 2611 * Never reclaim on behalf of optional batching, retry with a
2612 * single page instead. 2612 * single page instead.
2613 */ 2613 */
2614 if (nr_pages > min_pages) 2614 if (nr_pages > min_pages)
2615 return CHARGE_RETRY; 2615 return CHARGE_RETRY;
2616 2616
2617 if (!(gfp_mask & __GFP_WAIT)) 2617 if (!(gfp_mask & __GFP_WAIT))
2618 return CHARGE_WOULDBLOCK; 2618 return CHARGE_WOULDBLOCK;
2619 2619
2620 if (gfp_mask & __GFP_NORETRY) 2620 if (gfp_mask & __GFP_NORETRY)
2621 return CHARGE_NOMEM; 2621 return CHARGE_NOMEM;
2622 2622
2623 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2623 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2624 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2624 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2625 return CHARGE_RETRY; 2625 return CHARGE_RETRY;
2626 /* 2626 /*
2627 * Even though the limit is exceeded at this point, reclaim 2627 * Even though the limit is exceeded at this point, reclaim
2628 * may have been able to free some pages. Retry the charge 2628 * may have been able to free some pages. Retry the charge
2629 * before killing the task. 2629 * before killing the task.
2630 * 2630 *
2631 * Only for regular pages, though: huge pages are rather 2631 * Only for regular pages, though: huge pages are rather
2632 * unlikely to succeed so close to the limit, and we fall back 2632 * unlikely to succeed so close to the limit, and we fall back
2633 * to regular pages anyway in case of failure. 2633 * to regular pages anyway in case of failure.
2634 */ 2634 */
2635 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2635 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2636 return CHARGE_RETRY; 2636 return CHARGE_RETRY;
2637 2637
2638 /* 2638 /*
2639 * At task move, charge accounts can be doubly counted. So, it's 2639 * At task move, charge accounts can be doubly counted. So, it's
2640 * better to wait until the end of task_move if something is going on. 2640 * better to wait until the end of task_move if something is going on.
2641 */ 2641 */
2642 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2642 if (mem_cgroup_wait_acct_move(mem_over_limit))
2643 return CHARGE_RETRY; 2643 return CHARGE_RETRY;
2644 2644
2645 if (invoke_oom) 2645 if (invoke_oom)
2646 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); 2646 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2647 2647
2648 return CHARGE_NOMEM; 2648 return CHARGE_NOMEM;
2649 } 2649 }
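
mem_cgroup_do_charge() boils down to a decision ladder over the four CHARGE_* outcomes: succeed, drop the optional batching and retry, refuse to sleep for atomic callers, or report NOMEM so the caller may enter the OOM path. The userspace model below reproduces that ladder against a plain usage/limit pair; the reclaim step is reduced to a boolean and everything else (names, values) is invented for the illustration.

#include <stdbool.h>
#include <stdio.h>

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_WOULDBLOCK, CHARGE_NOMEM };

static unsigned long limit = 100, usage;

static bool counter_charge(unsigned int nr_pages)
{
        if (usage + nr_pages > limit)
                return false;
        usage += nr_pages;
        return true;
}

static int do_charge(unsigned int nr_pages, unsigned int min_pages,
                     bool can_sleep, bool reclaimed_enough)
{
        if (counter_charge(nr_pages))
                return CHARGE_OK;
        if (nr_pages > min_pages)
                return CHARGE_RETRY;            /* drop the optional batching first */
        if (!can_sleep)
                return CHARGE_WOULDBLOCK;       /* atomic context: no reclaim */
        if (reclaimed_enough)
                return CHARGE_RETRY;            /* reclaim freed room, try again */
        return CHARGE_NOMEM;                    /* caller may now invoke the OOM path */
}

int main(void)
{
        usage = 90;
        printf("batch of 32: %d\n", do_charge(32, 1, true, false)); /* CHARGE_RETRY */
        printf("single page: %d\n", do_charge(1, 1, true, false));  /* CHARGE_OK */
        return 0;
}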
2650 2650
2651 /** 2651 /**
2652 * mem_cgroup_try_charge - try charging a memcg 2652 * mem_cgroup_try_charge - try charging a memcg
2653 * @memcg: memcg to charge 2653 * @memcg: memcg to charge
2654 * @nr_pages: number of pages to charge 2654 * @nr_pages: number of pages to charge
2655 * @oom: trigger OOM if reclaim fails 2655 * @oom: trigger OOM if reclaim fails
2656 * 2656 *
2657 * Returns 0 if @memcg was charged successfully, -EINTR if the charge 2657 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2658 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. 2658 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2659 */ 2659 */
2660 static int mem_cgroup_try_charge(struct mem_cgroup *memcg, 2660 static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2661 gfp_t gfp_mask, 2661 gfp_t gfp_mask,
2662 unsigned int nr_pages, 2662 unsigned int nr_pages,
2663 bool oom) 2663 bool oom)
2664 { 2664 {
2665 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2665 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2667 int ret; 2667 int ret;
2668 2668
2669 if (mem_cgroup_is_root(memcg)) 2669 if (mem_cgroup_is_root(memcg))
2670 goto done; 2670 goto done;
2671 /* 2671 /*
2672 * Unlike in global OOM situations, memcg is not in a physical 2672 * Unlike in global OOM situations, memcg is not in a physical
2673 * memory shortage. Allow dying and OOM-killed tasks to 2673 * memory shortage. Allow dying and OOM-killed tasks to
2674 * bypass the last charges so that they can exit quickly and 2674 * bypass the last charges so that they can exit quickly and
2675 * free their memory. 2675 * free their memory.
2676 */ 2676 */
2677 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2677 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2678 fatal_signal_pending(current))) 2678 fatal_signal_pending(current)))
2679 goto bypass; 2679 goto bypass;
2680 2680
2681 if (unlikely(task_in_memcg_oom(current))) 2681 if (unlikely(task_in_memcg_oom(current)))
2682 goto nomem; 2682 goto nomem;
2683 2683
2684 if (gfp_mask & __GFP_NOFAIL) 2684 if (gfp_mask & __GFP_NOFAIL)
2685 oom = false; 2685 oom = false;
2686 again: 2686 again:
2687 if (consume_stock(memcg, nr_pages)) 2687 if (consume_stock(memcg, nr_pages))
2688 goto done; 2688 goto done;
2689 2689
2690 do { 2690 do {
2691 bool invoke_oom = oom && !nr_oom_retries; 2691 bool invoke_oom = oom && !nr_oom_retries;
2692 2692
2693 /* If killed, bypass charge */ 2693 /* If killed, bypass charge */
2694 if (fatal_signal_pending(current)) 2694 if (fatal_signal_pending(current))
2695 goto bypass; 2695 goto bypass;
2696 2696
2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2698 nr_pages, invoke_oom); 2698 nr_pages, invoke_oom);
2699 switch (ret) { 2699 switch (ret) {
2700 case CHARGE_OK: 2700 case CHARGE_OK:
2701 break; 2701 break;
2702 case CHARGE_RETRY: /* not in OOM situation but retry */ 2702 case CHARGE_RETRY: /* not in OOM situation but retry */
2703 batch = nr_pages; 2703 batch = nr_pages;
2704 goto again; 2704 goto again;
2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2706 goto nomem; 2706 goto nomem;
2707 case CHARGE_NOMEM: /* OOM routine works */ 2707 case CHARGE_NOMEM: /* OOM routine works */
2708 if (!oom || invoke_oom) 2708 if (!oom || invoke_oom)
2709 goto nomem; 2709 goto nomem;
2710 nr_oom_retries--; 2710 nr_oom_retries--;
2711 break; 2711 break;
2712 } 2712 }
2713 } while (ret != CHARGE_OK); 2713 } while (ret != CHARGE_OK);
2714 2714
2715 if (batch > nr_pages) 2715 if (batch > nr_pages)
2716 refill_stock(memcg, batch - nr_pages); 2716 refill_stock(memcg, batch - nr_pages);
2717 done: 2717 done:
2718 return 0; 2718 return 0;
2719 nomem: 2719 nomem:
2720 if (!(gfp_mask & __GFP_NOFAIL)) 2720 if (!(gfp_mask & __GFP_NOFAIL))
2721 return -ENOMEM; 2721 return -ENOMEM;
2722 bypass: 2722 bypass:
2723 return -EINTR; 2723 return -EINTR;
2724 } 2724 }
2725 2725
2726 /** 2726 /**
2727 * mem_cgroup_try_charge_mm - try charging a mm 2727 * mem_cgroup_try_charge_mm - try charging a mm
2728 * @mm: mm_struct to charge 2728 * @mm: mm_struct to charge
2729 * @nr_pages: number of pages to charge 2729 * @nr_pages: number of pages to charge
2730 * @oom: trigger OOM if reclaim fails 2730 * @oom: trigger OOM if reclaim fails
2731 * 2731 *
2732 * Returns the charged mem_cgroup associated with the given mm_struct or 2732 * Returns the charged mem_cgroup associated with the given mm_struct or
2733 * NULL if the charge failed. 2733 * NULL if the charge failed.
2734 */ 2734 */
2735 static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, 2735 static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2736 gfp_t gfp_mask, 2736 gfp_t gfp_mask,
2737 unsigned int nr_pages, 2737 unsigned int nr_pages,
2738 bool oom) 2738 bool oom)
2739 2739
2740 { 2740 {
2741 struct mem_cgroup *memcg; 2741 struct mem_cgroup *memcg;
2742 int ret; 2742 int ret;
2743 2743
2744 memcg = get_mem_cgroup_from_mm(mm); 2744 memcg = get_mem_cgroup_from_mm(mm);
2745 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); 2745 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
2746 css_put(&memcg->css); 2746 css_put(&memcg->css);
2747 if (ret == -EINTR) 2747 if (ret == -EINTR)
2748 memcg = root_mem_cgroup; 2748 memcg = root_mem_cgroup;
2749 else if (ret) 2749 else if (ret)
2750 memcg = NULL; 2750 memcg = NULL;
2751 2751
2752 return memcg; 2752 return memcg;
2753 } 2753 }
2754 2754
2755 /* 2755 /*
2756 * Sometimes we have to undo a charge we got by try_charge(). 2756 * Sometimes we have to undo a charge we got by try_charge().
2757 * This function is for that: it does the uncharge and puts the css refcount 2757 * This function is for that: it does the uncharge and puts the css refcount
2758 * gotten by try_charge(). 2758 * gotten by try_charge().
2759 */ 2759 */
2760 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2760 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2761 unsigned int nr_pages) 2761 unsigned int nr_pages)
2762 { 2762 {
2763 if (!mem_cgroup_is_root(memcg)) { 2763 if (!mem_cgroup_is_root(memcg)) {
2764 unsigned long bytes = nr_pages * PAGE_SIZE; 2764 unsigned long bytes = nr_pages * PAGE_SIZE;
2765 2765
2766 res_counter_uncharge(&memcg->res, bytes); 2766 res_counter_uncharge(&memcg->res, bytes);
2767 if (do_swap_account) 2767 if (do_swap_account)
2768 res_counter_uncharge(&memcg->memsw, bytes); 2768 res_counter_uncharge(&memcg->memsw, bytes);
2769 } 2769 }
2770 } 2770 }
2771 2771
2772 /* 2772 /*
2773 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup. 2773 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
2774 * This is useful when moving usage to parent cgroup. 2774 * This is useful when moving usage to parent cgroup.
2775 */ 2775 */
2776 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2776 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2777 unsigned int nr_pages) 2777 unsigned int nr_pages)
2778 { 2778 {
2779 unsigned long bytes = nr_pages * PAGE_SIZE; 2779 unsigned long bytes = nr_pages * PAGE_SIZE;
2780 2780
2781 if (mem_cgroup_is_root(memcg)) 2781 if (mem_cgroup_is_root(memcg))
2782 return; 2782 return;
2783 2783
2784 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2784 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2785 if (do_swap_account) 2785 if (do_swap_account)
2786 res_counter_uncharge_until(&memcg->memsw, 2786 res_counter_uncharge_until(&memcg->memsw,
2787 memcg->memsw.parent, bytes); 2787 memcg->memsw.parent, bytes);
2788 } 2788 }
2789 2789
2790 /* 2790 /*
2791 * A helper function to get a mem_cgroup from its ID. Must be called under 2791 * A helper function to get a mem_cgroup from its ID. Must be called under
2792 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2792 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2793 * the mem_cgroup is used for charging. (dropping refcnt from swap can be 2793 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2794 * called against removed memcg.) 2794 * called against removed memcg.)
2795 */ 2795 */
2796 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2796 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2797 { 2797 {
2798 /* ID 0 is unused ID */ 2798 /* ID 0 is unused ID */
2799 if (!id) 2799 if (!id)
2800 return NULL; 2800 return NULL;
2801 return mem_cgroup_from_id(id); 2801 return mem_cgroup_from_id(id);
2802 } 2802 }
2803 2803
2804 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2804 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2805 { 2805 {
2806 struct mem_cgroup *memcg = NULL; 2806 struct mem_cgroup *memcg = NULL;
2807 struct page_cgroup *pc; 2807 struct page_cgroup *pc;
2808 unsigned short id; 2808 unsigned short id;
2809 swp_entry_t ent; 2809 swp_entry_t ent;
2810 2810
2811 VM_BUG_ON_PAGE(!PageLocked(page), page); 2811 VM_BUG_ON_PAGE(!PageLocked(page), page);
2812 2812
2813 pc = lookup_page_cgroup(page); 2813 pc = lookup_page_cgroup(page);
2814 lock_page_cgroup(pc); 2814 lock_page_cgroup(pc);
2815 if (PageCgroupUsed(pc)) { 2815 if (PageCgroupUsed(pc)) {
2816 memcg = pc->mem_cgroup; 2816 memcg = pc->mem_cgroup;
2817 if (memcg && !css_tryget(&memcg->css)) 2817 if (memcg && !css_tryget(&memcg->css))
2818 memcg = NULL; 2818 memcg = NULL;
2819 } else if (PageSwapCache(page)) { 2819 } else if (PageSwapCache(page)) {
2820 ent.val = page_private(page); 2820 ent.val = page_private(page);
2821 id = lookup_swap_cgroup_id(ent); 2821 id = lookup_swap_cgroup_id(ent);
2822 rcu_read_lock(); 2822 rcu_read_lock();
2823 memcg = mem_cgroup_lookup(id); 2823 memcg = mem_cgroup_lookup(id);
2824 if (memcg && !css_tryget(&memcg->css)) 2824 if (memcg && !css_tryget(&memcg->css))
2825 memcg = NULL; 2825 memcg = NULL;
2826 rcu_read_unlock(); 2826 rcu_read_unlock();
2827 } 2827 }
2828 unlock_page_cgroup(pc); 2828 unlock_page_cgroup(pc);
2829 return memcg; 2829 return memcg;
2830 } 2830 }
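
Both branches above rely on the css_tryget() idiom: after a lockless or RCU lookup, a new reference is taken only if the object still holds at least one, so a dying memcg is simply treated as not found. Here is a generic userspace sketch of that idiom using C11 atomics; the names are invented and this is not the kernel's css implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcnt;
};

static bool obj_tryget(struct obj *o)
{
        int old = atomic_load(&o->refcnt);

        while (old > 0) {
                /* only bump the count if it is still non-zero */
                if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
                        return true;
        }
        return false;                   /* object is dying: pretend it wasn't found */
}

static void obj_put(struct obj *o)
{
        atomic_fetch_sub(&o->refcnt, 1);
}

int main(void)
{
        struct obj live = { .refcnt = 1 }, dead = { .refcnt = 0 };

        printf("live: %d, dead: %d\n", obj_tryget(&live), obj_tryget(&dead));
        if (atomic_load(&live.refcnt) == 2)
                obj_put(&live);         /* drop the reference we just took */
        return 0;
}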
2831 2831
2832 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2832 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2833 struct page *page, 2833 struct page *page,
2834 unsigned int nr_pages, 2834 unsigned int nr_pages,
2835 enum charge_type ctype, 2835 enum charge_type ctype,
2836 bool lrucare) 2836 bool lrucare)
2837 { 2837 {
2838 struct page_cgroup *pc = lookup_page_cgroup(page); 2838 struct page_cgroup *pc = lookup_page_cgroup(page);
2839 struct zone *uninitialized_var(zone); 2839 struct zone *uninitialized_var(zone);
2840 struct lruvec *lruvec; 2840 struct lruvec *lruvec;
2841 bool was_on_lru = false; 2841 bool was_on_lru = false;
2842 bool anon; 2842 bool anon;
2843 2843
2844 lock_page_cgroup(pc); 2844 lock_page_cgroup(pc);
2845 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2845 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2846 /* 2846 /*
2847 * we don't need page_cgroup_lock for tail pages, because they are not 2847 * we don't need page_cgroup_lock for tail pages, because they are not
2848 * accessed by any other context at this point. 2848 * accessed by any other context at this point.
2849 */ 2849 */
2850 2850
2851 /* 2851 /*
2852 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page 2852 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
2853 * may already be on some other mem_cgroup's LRU. Take care of it. 2853 * may already be on some other mem_cgroup's LRU. Take care of it.
2854 */ 2854 */
2855 if (lrucare) { 2855 if (lrucare) {
2856 zone = page_zone(page); 2856 zone = page_zone(page);
2857 spin_lock_irq(&zone->lru_lock); 2857 spin_lock_irq(&zone->lru_lock);
2858 if (PageLRU(page)) { 2858 if (PageLRU(page)) {
2859 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2859 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2860 ClearPageLRU(page); 2860 ClearPageLRU(page);
2861 del_page_from_lru_list(page, lruvec, page_lru(page)); 2861 del_page_from_lru_list(page, lruvec, page_lru(page));
2862 was_on_lru = true; 2862 was_on_lru = true;
2863 } 2863 }
2864 } 2864 }
2865 2865
2866 pc->mem_cgroup = memcg; 2866 pc->mem_cgroup = memcg;
2867 /* 2867 /*
2868 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2868 * We access a page_cgroup asynchronously without lock_page_cgroup().
2869 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2869 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2870 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2870 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2871 * before USED bit, we need memory barrier here. 2871 * before USED bit, we need memory barrier here.
2872 * See mem_cgroup_add_lru_list(), etc. 2872 * See mem_cgroup_add_lru_list(), etc.
2873 */ 2873 */
2874 smp_wmb(); 2874 smp_wmb();
2875 SetPageCgroupUsed(pc); 2875 SetPageCgroupUsed(pc);
2876 2876
2877 if (lrucare) { 2877 if (lrucare) {
2878 if (was_on_lru) { 2878 if (was_on_lru) {
2879 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2879 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2880 VM_BUG_ON_PAGE(PageLRU(page), page); 2880 VM_BUG_ON_PAGE(PageLRU(page), page);
2881 SetPageLRU(page); 2881 SetPageLRU(page);
2882 add_page_to_lru_list(page, lruvec, page_lru(page)); 2882 add_page_to_lru_list(page, lruvec, page_lru(page));
2883 } 2883 }
2884 spin_unlock_irq(&zone->lru_lock); 2884 spin_unlock_irq(&zone->lru_lock);
2885 } 2885 }
2886 2886
2887 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2887 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2888 anon = true; 2888 anon = true;
2889 else 2889 else
2890 anon = false; 2890 anon = false;
2891 2891
2892 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2892 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2893 unlock_page_cgroup(pc); 2893 unlock_page_cgroup(pc);
2894 2894
2895 /* 2895 /*
2896 * "charge_statistics" updated event counter. Then, check it. 2896 * "charge_statistics" updated event counter. Then, check it.
2897 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2897 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2898 * if they exceeds softlimit. 2898 * if they exceeds softlimit.
2899 */ 2899 */
2900 memcg_check_events(memcg, page); 2900 memcg_check_events(memcg, page);
2901 } 2901 }
2902 2902
2903 static DEFINE_MUTEX(set_limit_mutex); 2903 static DEFINE_MUTEX(set_limit_mutex);
2904 2904
2905 #ifdef CONFIG_MEMCG_KMEM 2905 #ifdef CONFIG_MEMCG_KMEM
2906 static DEFINE_MUTEX(activate_kmem_mutex); 2906 static DEFINE_MUTEX(activate_kmem_mutex);
2907 2907
2908 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2908 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2909 { 2909 {
2910 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2910 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2911 memcg_kmem_is_active(memcg); 2911 memcg_kmem_is_active(memcg);
2912 } 2912 }
2913 2913
2914 /* 2914 /*
2915 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2915 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2916 * in the memcg_cache_params struct. 2916 * in the memcg_cache_params struct.
2917 */ 2917 */
2918 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2918 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2919 { 2919 {
2920 struct kmem_cache *cachep; 2920 struct kmem_cache *cachep;
2921 2921
2922 VM_BUG_ON(p->is_root_cache); 2922 VM_BUG_ON(p->is_root_cache);
2923 cachep = p->root_cache; 2923 cachep = p->root_cache;
2924 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2924 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2925 } 2925 }
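
The backpointer comment above describes the shape of the data: the root cache owns an array of per-memcg child caches indexed by memcg_cache_id(), and each child's params carry the owning memcg plus a root_cache pointer, which is enough to find the child cache again. A self-contained userspace model of that layout follows, with structure names invented for the sketch.

#include <assert.h>
#include <stdio.h>

#define MAX_GROUPS 8

struct cache;

struct cache_params {
        int is_root;
        int memcg_id;                           /* child: the owner's kmem id */
        struct cache *root_cache;               /* child: backpointer to the root */
        struct cache *memcg_caches[MAX_GROUPS]; /* root: children indexed by id */
};

struct cache {
        const char *name;
        struct cache_params params;
};

/* analogue of memcg_params_to_cache(): child params -> child cache */
static struct cache *params_to_cache(struct cache_params *p)
{
        assert(!p->is_root);
        return p->root_cache->params.memcg_caches[p->memcg_id];
}

int main(void)
{
        struct cache root = { .name = "dentry", .params = { .is_root = 1 } };
        struct cache child = {
                .name = "dentry(2:grp)",
                .params = { .memcg_id = 2, .root_cache = &root },
        };

        root.params.memcg_caches[2] = &child;   /* publish the child in the root */
        printf("%s\n", params_to_cache(&child.params)->name);
        return 0;
}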
2926 2926
2927 #ifdef CONFIG_SLABINFO 2927 #ifdef CONFIG_SLABINFO
2928 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2928 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2929 { 2929 {
2930 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2930 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2931 struct memcg_cache_params *params; 2931 struct memcg_cache_params *params;
2932 2932
2933 if (!memcg_can_account_kmem(memcg)) 2933 if (!memcg_can_account_kmem(memcg))
2934 return -EIO; 2934 return -EIO;
2935 2935
2936 print_slabinfo_header(m); 2936 print_slabinfo_header(m);
2937 2937
2938 mutex_lock(&memcg->slab_caches_mutex); 2938 mutex_lock(&memcg->slab_caches_mutex);
2939 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2939 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2940 cache_show(memcg_params_to_cache(params), m); 2940 cache_show(memcg_params_to_cache(params), m);
2941 mutex_unlock(&memcg->slab_caches_mutex); 2941 mutex_unlock(&memcg->slab_caches_mutex);
2942 2942
2943 return 0; 2943 return 0;
2944 } 2944 }
2945 #endif 2945 #endif
2946 2946
2947 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2947 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2948 { 2948 {
2949 struct res_counter *fail_res; 2949 struct res_counter *fail_res;
2950 int ret = 0; 2950 int ret = 0;
2951 2951
2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2953 if (ret) 2953 if (ret)
2954 return ret; 2954 return ret;
2955 2955
2956 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, 2956 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
2957 oom_gfp_allowed(gfp)); 2957 oom_gfp_allowed(gfp));
2958 if (ret == -EINTR) { 2958 if (ret == -EINTR) {
2959 /* 2959 /*
2960 * mem_cgroup_try_charge() chose to bypass to root due to 2960 * mem_cgroup_try_charge() chose to bypass to root due to
2961 * OOM kill or fatal signal. Since our only options are to 2961 * OOM kill or fatal signal. Since our only options are to
2962 * either fail the allocation or charge it to this cgroup, do 2962 * either fail the allocation or charge it to this cgroup, do
2963 * it as a temporary condition. But we can't fail. From a 2963 * it as a temporary condition. But we can't fail. From a
2964 * kmem/slab perspective, the cache has already been selected, 2964 * kmem/slab perspective, the cache has already been selected,
2965 * by mem_cgroup_kmem_get_cache(), so it is too late to change 2965 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2966 * our minds. 2966 * our minds.
2967 * 2967 *
2968 * This condition will only trigger if the task entered 2968 * This condition will only trigger if the task entered
2969 * memcg_charge_kmem in a sane state, but was OOM-killed during 2969 * memcg_charge_kmem in a sane state, but was OOM-killed during
2970 * mem_cgroup_try_charge() above. Tasks that were already 2970 * mem_cgroup_try_charge() above. Tasks that were already
2971 * dying when the allocation triggers should have been already 2971 * dying when the allocation triggers should have been already
2972 * directed to the root cgroup in memcontrol.h 2972 * directed to the root cgroup in memcontrol.h
2973 */ 2973 */
2974 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2974 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2975 if (do_swap_account) 2975 if (do_swap_account)
2976 res_counter_charge_nofail(&memcg->memsw, size, 2976 res_counter_charge_nofail(&memcg->memsw, size,
2977 &fail_res); 2977 &fail_res);
2978 ret = 0; 2978 ret = 0;
2979 } else if (ret) 2979 } else if (ret)
2980 res_counter_uncharge(&memcg->kmem, size); 2980 res_counter_uncharge(&memcg->kmem, size);
2981 2981
2982 return ret; 2982 return ret;
2983 } 2983 }
2984 2984
2985 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2985 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2986 { 2986 {
2987 res_counter_uncharge(&memcg->res, size); 2987 res_counter_uncharge(&memcg->res, size);
2988 if (do_swap_account) 2988 if (do_swap_account)
2989 res_counter_uncharge(&memcg->memsw, size); 2989 res_counter_uncharge(&memcg->memsw, size);
2990 2990
2991 /* Not down to 0 */ 2991 /* Not down to 0 */
2992 if (res_counter_uncharge(&memcg->kmem, size)) 2992 if (res_counter_uncharge(&memcg->kmem, size))
2993 return; 2993 return;
2994 2994
2995 /* 2995 /*
2996 * Releases a reference taken in kmem_cgroup_css_offline in case 2996 * Releases a reference taken in kmem_cgroup_css_offline in case
2997 * this last uncharge is racing with the offlining code or it is 2997 * this last uncharge is racing with the offlining code or it is
2998 * outliving the memcg existence. 2998 * outliving the memcg existence.
2999 * 2999 *
3000 * The memory barrier imposed by test&clear is paired with the 3000 * The memory barrier imposed by test&clear is paired with the
3001 * explicit one in memcg_kmem_mark_dead(). 3001 * explicit one in memcg_kmem_mark_dead().
3002 */ 3002 */
3003 if (memcg_kmem_test_and_clear_dead(memcg)) 3003 if (memcg_kmem_test_and_clear_dead(memcg))
3004 css_put(&memcg->css); 3004 css_put(&memcg->css);
3005 } 3005 }
3006 3006
3007 /* 3007 /*
3008 * helper for accessing a memcg's index. It will be used as an index in the 3008 * helper for accessing a memcg's index. It will be used as an index in the
3009 * child cache array in kmem_cache, and also to derive its name. This function 3009 * child cache array in kmem_cache, and also to derive its name. This function
3010 * will return -1 when this is not a kmem-limited memcg. 3010 * will return -1 when this is not a kmem-limited memcg.
3011 */ 3011 */
3012 int memcg_cache_id(struct mem_cgroup *memcg) 3012 int memcg_cache_id(struct mem_cgroup *memcg)
3013 { 3013 {
3014 return memcg ? memcg->kmemcg_id : -1; 3014 return memcg ? memcg->kmemcg_id : -1;
3015 } 3015 }
3016 3016
3017 static size_t memcg_caches_array_size(int num_groups) 3017 static size_t memcg_caches_array_size(int num_groups)
3018 { 3018 {
3019 ssize_t size; 3019 ssize_t size;
3020 if (num_groups <= 0) 3020 if (num_groups <= 0)
3021 return 0; 3021 return 0;
3022 3022
3023 size = 2 * num_groups; 3023 size = 2 * num_groups;
3024 if (size < MEMCG_CACHES_MIN_SIZE) 3024 if (size < MEMCG_CACHES_MIN_SIZE)
3025 size = MEMCG_CACHES_MIN_SIZE; 3025 size = MEMCG_CACHES_MIN_SIZE;
3026 else if (size > MEMCG_CACHES_MAX_SIZE) 3026 else if (size > MEMCG_CACHES_MAX_SIZE)
3027 size = MEMCG_CACHES_MAX_SIZE; 3027 size = MEMCG_CACHES_MAX_SIZE;
3028 3028
3029 return size; 3029 return size;
3030 } 3030 }
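
memcg_caches_array_size() sizes the per-memcg cache array by doubling the group count and clamping the result. A worked example follows; note that the real MEMCG_CACHES_MIN_SIZE/MAX_SIZE constants are defined elsewhere in the tree, so the clamp values below are placeholders chosen for the sketch.

#include <stdio.h>
#include <sys/types.h>

#define MIN_SIZE 4L             /* placeholder for MEMCG_CACHES_MIN_SIZE */
#define MAX_SIZE 65535L         /* placeholder for MEMCG_CACHES_MAX_SIZE */

static size_t caches_array_size(int num_groups)
{
        ssize_t size;

        if (num_groups <= 0)
                return 0;
        size = 2 * num_groups;          /* leave headroom for new groups */
        if (size < MIN_SIZE)
                size = MIN_SIZE;
        else if (size > MAX_SIZE)
                size = MAX_SIZE;
        return size;
}

int main(void)
{
        printf("%zu %zu %zu\n",
               caches_array_size(1),            /* clamped up to the minimum */
               caches_array_size(100),          /* simply doubled to 200 */
               caches_array_size(1 << 20));     /* clamped down to the maximum */
        return 0;
}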
3031 3031
3032 /* 3032 /*
3033 * We should update the current array size iff all cache updates succeed. This 3033 * We should update the current array size iff all cache updates succeed. This
3034 * can only be done from the slab side. The slab mutex needs to be held when 3034 * can only be done from the slab side. The slab mutex needs to be held when
3035 * calling this. 3035 * calling this.
3036 */ 3036 */
3037 void memcg_update_array_size(int num) 3037 void memcg_update_array_size(int num)
3038 { 3038 {
3039 if (num > memcg_limited_groups_array_size) 3039 if (num > memcg_limited_groups_array_size)
3040 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3040 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3041 } 3041 }
3042 3042
3043 static void kmem_cache_destroy_work_func(struct work_struct *w); 3043 static void kmem_cache_destroy_work_func(struct work_struct *w);
3044 3044
3045 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3045 int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3046 { 3046 {
3047 struct memcg_cache_params *cur_params = s->memcg_params; 3047 struct memcg_cache_params *cur_params = s->memcg_params;
3048 3048
3049 VM_BUG_ON(!is_root_cache(s)); 3049 VM_BUG_ON(!is_root_cache(s));
3050 3050
3051 if (num_groups > memcg_limited_groups_array_size) { 3051 if (num_groups > memcg_limited_groups_array_size) {
3052 int i; 3052 int i;
3053 struct memcg_cache_params *new_params; 3053 struct memcg_cache_params *new_params;
3054 ssize_t size = memcg_caches_array_size(num_groups); 3054 ssize_t size = memcg_caches_array_size(num_groups);
3055 3055
3056 size *= sizeof(void *); 3056 size *= sizeof(void *);
3057 size += offsetof(struct memcg_cache_params, memcg_caches); 3057 size += offsetof(struct memcg_cache_params, memcg_caches);
3058 3058
3059 new_params = kzalloc(size, GFP_KERNEL); 3059 new_params = kzalloc(size, GFP_KERNEL);
3060 if (!new_params) 3060 if (!new_params)
3061 return -ENOMEM; 3061 return -ENOMEM;
3062 3062
3063 new_params->is_root_cache = true; 3063 new_params->is_root_cache = true;
3064 3064
3065 /* 3065 /*
3066 * There is a chance it will be bigger than 3066 * There is a chance it will be bigger than
3067 * memcg_limited_groups_array_size if we failed an allocation 3067 * memcg_limited_groups_array_size if we failed an allocation
3068 * in a cache, in which case all caches updated before it will 3068 * in a cache, in which case all caches updated before it will
3069 * have a bigger array. 3069 * have a bigger array.
3070 * 3070 *
3071 * But if that is the case, the data after 3071 * But if that is the case, the data after
3072 * memcg_limited_groups_array_size is certainly unused. 3072 * memcg_limited_groups_array_size is certainly unused.
3073 */ 3073 */
3074 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3074 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3075 if (!cur_params->memcg_caches[i]) 3075 if (!cur_params->memcg_caches[i])
3076 continue; 3076 continue;
3077 new_params->memcg_caches[i] = 3077 new_params->memcg_caches[i] =
3078 cur_params->memcg_caches[i]; 3078 cur_params->memcg_caches[i];
3079 } 3079 }
3080 3080
3081 /* 3081 /*
3082 * Ideally, we would wait until all caches succeed, and only 3082 * Ideally, we would wait until all caches succeed, and only
3083 * then free the old one. But this is not worth the extra 3083 * then free the old one. But this is not worth the extra
3084 * pointer per-cache we'd have to have for this. 3084 * pointer per-cache we'd have to have for this.
3085 * 3085 *
3086 * It is not a big deal if some caches are left with a size 3086 * It is not a big deal if some caches are left with a size
3087 * bigger than the others. And all updates will reset this 3087 * bigger than the others. And all updates will reset this
3088 * anyway. 3088 * anyway.
3089 */ 3089 */
3090 rcu_assign_pointer(s->memcg_params, new_params); 3090 rcu_assign_pointer(s->memcg_params, new_params);
3091 if (cur_params) 3091 if (cur_params)
3092 kfree_rcu(cur_params, rcu_head); 3092 kfree_rcu(cur_params, rcu_head);
3093 } 3093 }
3094 return 0; 3094 return 0;
3095 } 3095 }
3096 3096
3097 char *memcg_create_cache_name(struct mem_cgroup *memcg, 3097 char *memcg_create_cache_name(struct mem_cgroup *memcg,
3098 struct kmem_cache *root_cache) 3098 struct kmem_cache *root_cache)
3099 { 3099 {
3100 static char *buf = NULL; 3100 static char *buf = NULL;
3101 3101
3102 /* 3102 /*
3103 * We need a mutex here to protect the shared buffer. Since this is 3103 * We need a mutex here to protect the shared buffer. Since this is
3104 * expected to be called only on cache creation, we can employ the 3104 * expected to be called only on cache creation, we can employ the
3105 * slab_mutex for that purpose. 3105 * slab_mutex for that purpose.
3106 */ 3106 */
3107 lockdep_assert_held(&slab_mutex); 3107 lockdep_assert_held(&slab_mutex);
3108 3108
3109 if (!buf) { 3109 if (!buf) {
3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); 3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3111 if (!buf) 3111 if (!buf)
3112 return NULL; 3112 return NULL;
3113 } 3113 }
3114 3114
3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); 3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3117 memcg_cache_id(memcg), buf); 3117 memcg_cache_id(memcg), buf);
3118 } 3118 }
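
The resulting cache name has the form "<root cache name>(<memcg kmem id>:<cgroup name>)". A trivial userspace illustration of that format; the concrete names and id below are made up.

#include <stdio.h>

int main(void)
{
        char name[128];

        snprintf(name, sizeof(name), "%s(%d:%s)", "kmalloc-64", 3, "workers");
        printf("%s\n", name);           /* -> kmalloc-64(3:workers) */
        return 0;
}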
3119 3119
3120 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 3120 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3121 struct kmem_cache *root_cache) 3121 struct kmem_cache *root_cache)
3122 { 3122 {
3123 size_t size; 3123 size_t size;
3124 3124
3125 if (!memcg_kmem_enabled()) 3125 if (!memcg_kmem_enabled())
3126 return 0; 3126 return 0;
3127 3127
3128 if (!memcg) { 3128 if (!memcg) {
3129 size = offsetof(struct memcg_cache_params, memcg_caches); 3129 size = offsetof(struct memcg_cache_params, memcg_caches);
3130 size += memcg_limited_groups_array_size * sizeof(void *); 3130 size += memcg_limited_groups_array_size * sizeof(void *);
3131 } else 3131 } else
3132 size = sizeof(struct memcg_cache_params); 3132 size = sizeof(struct memcg_cache_params);
3133 3133
3134 s->memcg_params = kzalloc(size, GFP_KERNEL); 3134 s->memcg_params = kzalloc(size, GFP_KERNEL);
3135 if (!s->memcg_params) 3135 if (!s->memcg_params)
3136 return -ENOMEM; 3136 return -ENOMEM;
3137 3137
3138 if (memcg) { 3138 if (memcg) {
3139 s->memcg_params->memcg = memcg; 3139 s->memcg_params->memcg = memcg;
3140 s->memcg_params->root_cache = root_cache; 3140 s->memcg_params->root_cache = root_cache;
3141 INIT_WORK(&s->memcg_params->destroy, 3141 INIT_WORK(&s->memcg_params->destroy,
3142 kmem_cache_destroy_work_func); 3142 kmem_cache_destroy_work_func);
3143 } else 3143 } else
3144 s->memcg_params->is_root_cache = true; 3144 s->memcg_params->is_root_cache = true;
3145 3145
3146 return 0; 3146 return 0;
3147 } 3147 }
3148 3148
3149 void memcg_free_cache_params(struct kmem_cache *s) 3149 void memcg_free_cache_params(struct kmem_cache *s)
3150 { 3150 {
3151 kfree(s->memcg_params); 3151 kfree(s->memcg_params);
3152 } 3152 }
3153 3153
3154 void memcg_register_cache(struct kmem_cache *s) 3154 void memcg_register_cache(struct kmem_cache *s)
3155 { 3155 {
3156 struct kmem_cache *root; 3156 struct kmem_cache *root;
3157 struct mem_cgroup *memcg; 3157 struct mem_cgroup *memcg;
3158 int id; 3158 int id;
3159 3159
3160 if (is_root_cache(s)) 3160 if (is_root_cache(s))
3161 return; 3161 return;
3162 3162
3163 /* 3163 /*
3164 * Holding the slab_mutex assures nobody will touch the memcg_caches 3164 * Holding the slab_mutex assures nobody will touch the memcg_caches
3165 * array while we are modifying it. 3165 * array while we are modifying it.
3166 */ 3166 */
3167 lockdep_assert_held(&slab_mutex); 3167 lockdep_assert_held(&slab_mutex);
3168 3168
3169 root = s->memcg_params->root_cache; 3169 root = s->memcg_params->root_cache;
3170 memcg = s->memcg_params->memcg; 3170 memcg = s->memcg_params->memcg;
3171 id = memcg_cache_id(memcg); 3171 id = memcg_cache_id(memcg);
3172 3172
3173 css_get(&memcg->css); 3173 css_get(&memcg->css);
3174 3174
3175 3175
3176 /* 3176 /*
3177 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3177 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3178 * barrier here to ensure nobody will see the kmem_cache partially 3178 * barrier here to ensure nobody will see the kmem_cache partially
3179 * initialized. 3179 * initialized.
3180 */ 3180 */
3181 smp_wmb(); 3181 smp_wmb();
3182 3182
3183 /* 3183 /*
3184 * Initialize the pointer to this cache in its parent's memcg_params 3184 * Initialize the pointer to this cache in its parent's memcg_params
3185 * before adding it to the memcg_slab_caches list, otherwise we can 3185 * before adding it to the memcg_slab_caches list, otherwise we can
3186 * fail to convert memcg_params_to_cache() while traversing the list. 3186 * fail to convert memcg_params_to_cache() while traversing the list.
3187 */ 3187 */
3188 VM_BUG_ON(root->memcg_params->memcg_caches[id]); 3188 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3189 root->memcg_params->memcg_caches[id] = s; 3189 root->memcg_params->memcg_caches[id] = s;
3190 3190
3191 mutex_lock(&memcg->slab_caches_mutex); 3191 mutex_lock(&memcg->slab_caches_mutex);
3192 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); 3192 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3193 mutex_unlock(&memcg->slab_caches_mutex); 3193 mutex_unlock(&memcg->slab_caches_mutex);
3194 } 3194 }
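
memcg_register_cache() publishes the child cache for lockless readers: the cache is fully initialized, a write barrier is issued, and only then is the pointer stored in the root's memcg_caches array that cache_from_memcg_idx() scans. The sketch below models the same publish/lookup ordering with C11 release/acquire atomics; it is an analogy for illustration, not the kernel's smp_wmb()-based code, and all names are invented.

#include <stdatomic.h>
#include <stdio.h>

struct cache {
        int object_size;
};

#define NR_IDS 8
static _Atomic(struct cache *) caches[NR_IDS];

static void register_cache(int id, struct cache *c, int size)
{
        c->object_size = size;          /* fully initialize the object first... */
        atomic_store_explicit(&caches[id], c, memory_order_release);
}

static struct cache *lookup_cache(int id)
{
        /* ...so this acquire load sees a complete object or NULL */
        return atomic_load_explicit(&caches[id], memory_order_acquire);
}

int main(void)
{
        static struct cache c;

        register_cache(3, &c, 64);
        printf("size: %d\n", lookup_cache(3)->object_size);
        return 0;
}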
3195 3195
3196 void memcg_unregister_cache(struct kmem_cache *s) 3196 void memcg_unregister_cache(struct kmem_cache *s)
3197 { 3197 {
3198 struct kmem_cache *root; 3198 struct kmem_cache *root;
3199 struct mem_cgroup *memcg; 3199 struct mem_cgroup *memcg;
3200 int id; 3200 int id;
3201 3201
3202 if (is_root_cache(s)) 3202 if (is_root_cache(s))
3203 return; 3203 return;
3204 3204
3205 /* 3205 /*
3206 * Holding the slab_mutex assures nobody will touch the memcg_caches 3206 * Holding the slab_mutex assures nobody will touch the memcg_caches
3207 * array while we are modifying it. 3207 * array while we are modifying it.
3208 */ 3208 */
3209 lockdep_assert_held(&slab_mutex); 3209 lockdep_assert_held(&slab_mutex);
3210 3210
3211 root = s->memcg_params->root_cache; 3211 root = s->memcg_params->root_cache;
3212 memcg = s->memcg_params->memcg; 3212 memcg = s->memcg_params->memcg;
3213 id = memcg_cache_id(memcg); 3213 id = memcg_cache_id(memcg);
3214 3214
3215 mutex_lock(&memcg->slab_caches_mutex); 3215 mutex_lock(&memcg->slab_caches_mutex);
3216 list_del(&s->memcg_params->list); 3216 list_del(&s->memcg_params->list);
3217 mutex_unlock(&memcg->slab_caches_mutex); 3217 mutex_unlock(&memcg->slab_caches_mutex);
3218 3218
3219 /* 3219 /*
3220 * Clear the pointer to this cache in its parent's memcg_params only 3220 * Clear the pointer to this cache in its parent's memcg_params only
3221 * after removing it from the memcg_slab_caches list, otherwise we can 3221 * after removing it from the memcg_slab_caches list, otherwise we can
3222 * fail to convert memcg_params_to_cache() while traversing the list. 3222 * fail to convert memcg_params_to_cache() while traversing the list.
3223 */ 3223 */
3224 VM_BUG_ON(!root->memcg_params->memcg_caches[id]); 3224 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3225 root->memcg_params->memcg_caches[id] = NULL; 3225 root->memcg_params->memcg_caches[id] = NULL;
3226 3226
3227 css_put(&memcg->css); 3227 css_put(&memcg->css);
3228 } 3228 }
3229 3229
3230 /* 3230 /*
3231 * During the creation of a new cache, we need to disable our accounting mechanism 3231 * During the creation of a new cache, we need to disable our accounting mechanism
3232 * altogether. This is true even if we are not creating, but rather just 3232 * altogether. This is true even if we are not creating, but rather just
3233 * enqueueing new caches to be created. 3233 * enqueueing new caches to be created.
3234 * 3234 *
3235 * This is because that process will trigger allocations; some visible, like 3235 * This is because that process will trigger allocations; some visible, like
3236 * explicit kmallocs to auxiliary data structures, name strings and internal 3236 * explicit kmallocs to auxiliary data structures, name strings and internal
3237 * cache structures; some well concealed, like INIT_WORK() that can allocate 3237 * cache structures; some well concealed, like INIT_WORK() that can allocate
3238 * objects during debug. 3238 * objects during debug.
3239 * 3239 *
3240 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3240 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3241 * to it. This may not be a bounded recursion: since the first cache creation 3241 * to it. This may not be a bounded recursion: since the first cache creation
3242 * failed to complete (waiting on the allocation), we'll just try to create the 3242 * failed to complete (waiting on the allocation), we'll just try to create the
3243 * cache again, failing at the same point. 3243 * cache again, failing at the same point.
3244 * 3244 *
3245 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3245 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3246 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3246 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3247 * inside the following two functions. 3247 * inside the following two functions.
3248 */ 3248 */
3249 static inline void memcg_stop_kmem_account(void) 3249 static inline void memcg_stop_kmem_account(void)
3250 { 3250 {
3251 VM_BUG_ON(!current->mm); 3251 VM_BUG_ON(!current->mm);
3252 current->memcg_kmem_skip_account++; 3252 current->memcg_kmem_skip_account++;
3253 } 3253 }
3254 3254
3255 static inline void memcg_resume_kmem_account(void) 3255 static inline void memcg_resume_kmem_account(void)
3256 { 3256 {
3257 VM_BUG_ON(!current->mm); 3257 VM_BUG_ON(!current->mm);
3258 current->memcg_kmem_skip_account--; 3258 current->memcg_kmem_skip_account--;
3259 } 3259 }
3260 3260
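For reference, a minimal sketch of the wrapping pattern described above, assuming process context; example_unaccounted_alloc() is a hypothetical helper, while the real callers (such as memcg_create_cache_enqueue() further down) bracket their allocations in exactly this way:

static void example_unaccounted_alloc(void)
{
	void *buf;

	memcg_stop_kmem_account();
	buf = kmalloc(128, GFP_KERNEL);	/* not charged to the current memcg */
	memcg_resume_kmem_account();

	kfree(buf);
}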
3261 static void kmem_cache_destroy_work_func(struct work_struct *w) 3261 static void kmem_cache_destroy_work_func(struct work_struct *w)
3262 { 3262 {
3263 struct kmem_cache *cachep; 3263 struct kmem_cache *cachep;
3264 struct memcg_cache_params *p; 3264 struct memcg_cache_params *p;
3265 3265
3266 p = container_of(w, struct memcg_cache_params, destroy); 3266 p = container_of(w, struct memcg_cache_params, destroy);
3267 3267
3268 cachep = memcg_params_to_cache(p); 3268 cachep = memcg_params_to_cache(p);
3269 3269
3270 /* 3270 /*
3271 * If we get down to 0 after shrink, we could delete right away. 3271 * If we get down to 0 after shrink, we could delete right away.
3272 * However, memcg_release_pages() already puts us back in the workqueue 3272 * However, memcg_release_pages() already puts us back in the workqueue
3273 * in that case. If we proceed deleting, we'll get a dangling 3273 * in that case. If we proceed deleting, we'll get a dangling
3274 * reference, and removing the object from the workqueue in that case 3274 * reference, and removing the object from the workqueue in that case
3275 * is an unnecessary complication. We are not a fast path. 3275 * is an unnecessary complication. We are not a fast path.
3276 * 3276 *
3277 * Note that this case is fundamentally different from racing with 3277 * Note that this case is fundamentally different from racing with
3278 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in 3278 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3279 * kmem_cache_shrink, not only we would be reinserting a dead cache 3279 * kmem_cache_shrink, not only we would be reinserting a dead cache
3280 * into the queue, but doing so from inside the worker racing to 3280 * into the queue, but doing so from inside the worker racing to
3281 * destroy it. 3281 * destroy it.
3282 * 3282 *
3283 * So if we aren't down to zero, we'll just schedule a worker and try 3283 * So if we aren't down to zero, we'll just schedule a worker and try
3284 * again 3284 * again
3285 */ 3285 */
3286 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) 3286 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3287 kmem_cache_shrink(cachep); 3287 kmem_cache_shrink(cachep);
3288 else 3288 else
3289 kmem_cache_destroy(cachep); 3289 kmem_cache_destroy(cachep);
3290 } 3290 }
3291 3291
3292 void mem_cgroup_destroy_cache(struct kmem_cache *cachep) 3292 void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3293 { 3293 {
3294 if (!cachep->memcg_params->dead) 3294 if (!cachep->memcg_params->dead)
3295 return; 3295 return;
3296 3296
3297 /* 3297 /*
3298 * There are many ways in which we can get here. 3298 * There are many ways in which we can get here.
3299 * 3299 *
3300 * We can get to a memory-pressure situation while the delayed work is 3300 * We can get to a memory-pressure situation while the delayed work is
3301 * still pending to run. The vmscan shrinkers can then release all 3301 * still pending to run. The vmscan shrinkers can then release all
3302 * cache memory and get us to destruction. If this is the case, we'll 3302 * cache memory and get us to destruction. If this is the case, we'll
3303 * be executed twice, which is a bug (the second time will execute over 3303 * be executed twice, which is a bug (the second time will execute over
3304 * bogus data). In this case, cancelling the work should be fine. 3304 * bogus data). In this case, cancelling the work should be fine.
3305 * 3305 *
3306 * But we can also get here from the worker itself, if 3306 * But we can also get here from the worker itself, if
3307 * kmem_cache_shrink is enough to shake all the remaining objects and 3307 * kmem_cache_shrink is enough to shake all the remaining objects and
3308 * get the page count to 0. In this case, we'll deadlock if we try to 3308 * get the page count to 0. In this case, we'll deadlock if we try to
3309 * cancel the work (the worker runs with an internal lock held, which 3309 * cancel the work (the worker runs with an internal lock held, which
3310 * is the same lock we would hold for cancel_work_sync().) 3310 * is the same lock we would hold for cancel_work_sync().)
3311 * 3311 *
3312 * Since we can't possibly know who got us here, just refrain from 3312 * Since we can't possibly know who got us here, just refrain from
3313 * running if there is already work pending 3313 * running if there is already work pending
3314 */ 3314 */
3315 if (work_pending(&cachep->memcg_params->destroy)) 3315 if (work_pending(&cachep->memcg_params->destroy))
3316 return; 3316 return;
3317 /* 3317 /*
3318 * We have to defer the actual destroying to a workqueue, because 3318 * We have to defer the actual destroying to a workqueue, because
3319 * we might currently be in a context that cannot sleep. 3319 * we might currently be in a context that cannot sleep.
3320 */ 3320 */
3321 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3322 } 3322 }
3323 3323
3324 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3324 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3325 { 3325 {
3326 struct kmem_cache *c; 3326 struct kmem_cache *c;
3327 int i; 3327 int i;
3328 3328
3329 if (!s->memcg_params) 3329 if (!s->memcg_params)
3330 return; 3330 return;
3331 if (!s->memcg_params->is_root_cache) 3331 if (!s->memcg_params->is_root_cache)
3332 return; 3332 return;
3333 3333
3334 /* 3334 /*
3335 * If the cache is being destroyed, we trust that there is no one else 3335 * If the cache is being destroyed, we trust that there is no one else
3336 * requesting objects from it. Even if there are, the sanity checks in 3336 * requesting objects from it. Even if there are, the sanity checks in
3337 * kmem_cache_destroy should catch this ill case. 3337 * kmem_cache_destroy should catch this ill case.
3338 * 3338 *
3339 * Still, we don't want anyone else freeing memcg_caches under our 3339 * Still, we don't want anyone else freeing memcg_caches under our
3340 * noses, which can happen if a new memcg comes to life. As usual, 3340 * noses, which can happen if a new memcg comes to life. As usual,
3341 * we'll take the activate_kmem_mutex to protect ourselves against 3341 * we'll take the activate_kmem_mutex to protect ourselves against
3342 * this. 3342 * this.
3343 */ 3343 */
3344 mutex_lock(&activate_kmem_mutex); 3344 mutex_lock(&activate_kmem_mutex);
3345 for_each_memcg_cache_index(i) { 3345 for_each_memcg_cache_index(i) {
3346 c = cache_from_memcg_idx(s, i); 3346 c = cache_from_memcg_idx(s, i);
3347 if (!c) 3347 if (!c)
3348 continue; 3348 continue;
3349 3349
3350 /* 3350 /*
3351 * We will now manually delete the caches, so to avoid races 3351 * We will now manually delete the caches, so to avoid races
3352 * we need to cancel all pending destruction workers and 3352 * we need to cancel all pending destruction workers and
3353 * proceed with destruction ourselves. 3353 * proceed with destruction ourselves.
3354 * 3354 *
3355 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3355 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3356 * and that could spawn the workers again: it is likely that 3356 * and that could spawn the workers again: it is likely that
3357 * the cache still has active pages until this very moment. 3357 * the cache still has active pages until this very moment.
3358 * This would lead us back to mem_cgroup_destroy_cache. 3358 * This would lead us back to mem_cgroup_destroy_cache.
3359 * 3359 *
3360 * But that will not execute at all if the "dead" flag is not 3360 * But that will not execute at all if the "dead" flag is not
3361 * set, so flip it down to guarantee we are in control. 3361 * set, so flip it down to guarantee we are in control.
3362 */ 3362 */
3363 c->memcg_params->dead = false; 3363 c->memcg_params->dead = false;
3364 cancel_work_sync(&c->memcg_params->destroy); 3364 cancel_work_sync(&c->memcg_params->destroy);
3365 kmem_cache_destroy(c); 3365 kmem_cache_destroy(c);
3366 } 3366 }
3367 mutex_unlock(&activate_kmem_mutex); 3367 mutex_unlock(&activate_kmem_mutex);
3368 } 3368 }
3369 3369
3370 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3370 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3371 { 3371 {
3372 struct kmem_cache *cachep; 3372 struct kmem_cache *cachep;
3373 struct memcg_cache_params *params; 3373 struct memcg_cache_params *params;
3374 3374
3375 if (!memcg_kmem_is_active(memcg)) 3375 if (!memcg_kmem_is_active(memcg))
3376 return; 3376 return;
3377 3377
3378 mutex_lock(&memcg->slab_caches_mutex); 3378 mutex_lock(&memcg->slab_caches_mutex);
3379 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3379 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3380 cachep = memcg_params_to_cache(params); 3380 cachep = memcg_params_to_cache(params);
3381 cachep->memcg_params->dead = true; 3381 cachep->memcg_params->dead = true;
3382 schedule_work(&cachep->memcg_params->destroy); 3382 schedule_work(&cachep->memcg_params->destroy);
3383 } 3383 }
3384 mutex_unlock(&memcg->slab_caches_mutex); 3384 mutex_unlock(&memcg->slab_caches_mutex);
3385 } 3385 }
3386 3386
3387 struct create_work { 3387 struct create_work {
3388 struct mem_cgroup *memcg; 3388 struct mem_cgroup *memcg;
3389 struct kmem_cache *cachep; 3389 struct kmem_cache *cachep;
3390 struct work_struct work; 3390 struct work_struct work;
3391 }; 3391 };
3392 3392
3393 static void memcg_create_cache_work_func(struct work_struct *w) 3393 static void memcg_create_cache_work_func(struct work_struct *w)
3394 { 3394 {
3395 struct create_work *cw = container_of(w, struct create_work, work); 3395 struct create_work *cw = container_of(w, struct create_work, work);
3396 struct mem_cgroup *memcg = cw->memcg; 3396 struct mem_cgroup *memcg = cw->memcg;
3397 struct kmem_cache *cachep = cw->cachep; 3397 struct kmem_cache *cachep = cw->cachep;
3398 struct kmem_cache *new;
3399 3398
3400 new = kmem_cache_create_memcg(memcg, cachep->name, 3399 kmem_cache_create_memcg(memcg, cachep);
3401 cachep->object_size, cachep->align,
3402 cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep);
3403 if (new)
3404 new->allocflags |= __GFP_KMEMCG;
3405 css_put(&memcg->css); 3400 css_put(&memcg->css);
3406 kfree(cw); 3401 kfree(cw);
3407 } 3402 }
3408 3403
3409 /* 3404 /*
3410 * Enqueue the creation of a per-memcg kmem_cache. 3405 * Enqueue the creation of a per-memcg kmem_cache.
3411 */ 3406 */
3412 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3407 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3413 struct kmem_cache *cachep) 3408 struct kmem_cache *cachep)
3414 { 3409 {
3415 struct create_work *cw; 3410 struct create_work *cw;
3416 3411
3417 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3412 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3418 if (cw == NULL) { 3413 if (cw == NULL) {
3419 css_put(&memcg->css); 3414 css_put(&memcg->css);
3420 return; 3415 return;
3421 } 3416 }
3422 3417
3423 cw->memcg = memcg; 3418 cw->memcg = memcg;
3424 cw->cachep = cachep; 3419 cw->cachep = cachep;
3425 3420
3426 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3421 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3427 schedule_work(&cw->work); 3422 schedule_work(&cw->work);
3428 } 3423 }
3429 3424
3430 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3425 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3431 struct kmem_cache *cachep) 3426 struct kmem_cache *cachep)
3432 { 3427 {
3433 /* 3428 /*
3434 * We need to stop accounting when we kmalloc, because if the 3429 * We need to stop accounting when we kmalloc, because if the
3435 * corresponding kmalloc cache is not yet created, the first allocation 3430 * corresponding kmalloc cache is not yet created, the first allocation
3436 * in __memcg_create_cache_enqueue will recurse. 3431 * in __memcg_create_cache_enqueue will recurse.
3437 * 3432 *
3438 * However, it is better to enclose the whole function. Depending on 3433 * However, it is better to enclose the whole function. Depending on
3439 * the debugging options enabled, INIT_WORK(), for instance, can 3434 * the debugging options enabled, INIT_WORK(), for instance, can
3440 * trigger an allocation. This too, will make us recurse. Because at 3435 * trigger an allocation. This too, will make us recurse. Because at
3441 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3436 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3442 * the safest choice is to do it like this, wrapping the whole function. 3437 * the safest choice is to do it like this, wrapping the whole function.
3443 */ 3438 */
3444 memcg_stop_kmem_account(); 3439 memcg_stop_kmem_account();
3445 __memcg_create_cache_enqueue(memcg, cachep); 3440 __memcg_create_cache_enqueue(memcg, cachep);
3446 memcg_resume_kmem_account(); 3441 memcg_resume_kmem_account();
3447 } 3442 }
3448 /* 3443 /*
3449 * Return the kmem_cache we're supposed to use for a slab allocation. 3444 * Return the kmem_cache we're supposed to use for a slab allocation.
3450 * We try to use the current memcg's version of the cache. 3445 * We try to use the current memcg's version of the cache.
3451 * 3446 *
3452 * If the cache does not exist yet, i.e. we are the first user of it, 3447 * If the cache does not exist yet, i.e. we are the first user of it,
3453 * we either create it immediately, if possible, or create it asynchronously 3448 * we either create it immediately, if possible, or create it asynchronously
3454 * in a workqueue. 3449 * in a workqueue.
3455 * In the latter case, we will let the current allocation go through with 3450 * In the latter case, we will let the current allocation go through with
3456 * the original cache. 3451 * the original cache.
3457 * 3452 *
3458 * Can't be called in interrupt context or from kernel threads. 3453 * Can't be called in interrupt context or from kernel threads.
3459 * This function needs to be called with rcu_read_lock() held. 3454 * This function needs to be called with rcu_read_lock() held.
3460 */ 3455 */
3461 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3456 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3462 gfp_t gfp) 3457 gfp_t gfp)
3463 { 3458 {
3464 struct mem_cgroup *memcg; 3459 struct mem_cgroup *memcg;
3465 struct kmem_cache *memcg_cachep; 3460 struct kmem_cache *memcg_cachep;
3466 3461
3467 VM_BUG_ON(!cachep->memcg_params); 3462 VM_BUG_ON(!cachep->memcg_params);
3468 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3463 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3469 3464
3470 if (!current->mm || current->memcg_kmem_skip_account) 3465 if (!current->mm || current->memcg_kmem_skip_account)
3471 return cachep; 3466 return cachep;
3472 3467
3473 rcu_read_lock(); 3468 rcu_read_lock();
3474 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3469 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3475 3470
3476 if (!memcg_can_account_kmem(memcg)) 3471 if (!memcg_can_account_kmem(memcg))
3477 goto out; 3472 goto out;
3478 3473
3479 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3474 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3480 if (likely(memcg_cachep)) { 3475 if (likely(memcg_cachep)) {
3481 cachep = memcg_cachep; 3476 cachep = memcg_cachep;
3482 goto out; 3477 goto out;
3483 } 3478 }
3484 3479
3485 /* The corresponding put will be done in the workqueue. */ 3480 /* The corresponding put will be done in the workqueue. */
3486 if (!css_tryget(&memcg->css)) 3481 if (!css_tryget(&memcg->css))
3487 goto out; 3482 goto out;
3488 rcu_read_unlock(); 3483 rcu_read_unlock();
3489 3484
3490 /* 3485 /*
3491 * If we are in a safe context (can wait, and not in interrupt 3486 * If we are in a safe context (can wait, and not in interrupt
3492 * context), we could be predictable and return right away. 3487 * context), we could be predictable and return right away.
3493 * This would guarantee that the allocation being performed 3488 * This would guarantee that the allocation being performed
3494 * already belongs in the new cache. 3489 * already belongs in the new cache.
3495 * 3490 *
3496 * However, there are some clashes that can arise from locking. 3491 * However, there are some clashes that can arise from locking.
3497 * For instance, because we acquire the slab_mutex while doing 3492 * For instance, because we acquire the slab_mutex while doing
3498 * kmem_cache_dup, this means no further allocation could happen 3493 * kmem_cache_dup, this means no further allocation could happen
3499 * with the slab_mutex held. 3494 * with the slab_mutex held.
3500 * 3495 *
3501 * Also, because cache creation issues get_online_cpus(), this 3496 * Also, because cache creation issues get_online_cpus(), this
3502 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3497 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3503 * that ends up reversed during cpu hotplug. (cpuset allocates 3498 * that ends up reversed during cpu hotplug. (cpuset allocates
3504 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, 3499 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3505 * better to defer everything. 3500 * better to defer everything.
3506 */ 3501 */
3507 memcg_create_cache_enqueue(memcg, cachep); 3502 memcg_create_cache_enqueue(memcg, cachep);
3508 return cachep; 3503 return cachep;
3509 out: 3504 out:
3510 rcu_read_unlock(); 3505 rcu_read_unlock();
3511 return cachep; 3506 return cachep;
3512 } 3507 }
3513 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3508 EXPORT_SYMBOL(__memcg_kmem_get_cache);
3514 3509
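As a hedged illustration of the intended use, a hypothetical allocation helper; memcg_kmem_get_cache() is the inline wrapper that guards this function, and example_alloc() is made up for illustration:

static void *example_alloc(struct kmem_cache *s, gfp_t gfp)
{
	/* may substitute the current memcg's child cache for the root cache */
	s = memcg_kmem_get_cache(s, gfp);
	return kmem_cache_alloc(s, gfp);
}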
3515 /* 3510 /*
3516 * We need to verify if the allocation against current->mm->owner's memcg is 3511 * We need to verify if the allocation against current->mm->owner's memcg is
3517 * possible for the given order. But the page is not allocated yet, so we'll 3512 * possible for the given order. But the page is not allocated yet, so we'll
3518 * need a further commit step to do the final arrangements. 3513 * need a further commit step to do the final arrangements.
3519 * 3514 *
3520 * It is possible for the task to switch cgroups in the meantime, so at 3515 * It is possible for the task to switch cgroups in the meantime, so at
3521 * commit time, we can't rely on task conversion any longer. We'll then use 3516 * commit time, we can't rely on task conversion any longer. We'll then use
3522 * the handle argument to return to the caller which cgroup we should commit 3517 * the handle argument to return to the caller which cgroup we should commit
3523 * against. We could also return the memcg directly and avoid the pointer 3518 * against. We could also return the memcg directly and avoid the pointer
3524 * passing, but a boolean return value gives better semantics considering 3519 * passing, but a boolean return value gives better semantics considering
3525 * the compiled-out case as well. 3520 * the compiled-out case as well.
3526 * 3521 *
3527 * Returning true means the allocation is possible. 3522 * Returning true means the allocation is possible.
3528 */ 3523 */
3529 bool 3524 bool
3530 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3525 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3531 { 3526 {
3532 struct mem_cgroup *memcg; 3527 struct mem_cgroup *memcg;
3533 int ret; 3528 int ret;
3534 3529
3535 *_memcg = NULL; 3530 *_memcg = NULL;
3536 3531
3537 /* 3532 /*
3538 * Disabling accounting is only relevant for some specific memcg 3533 * Disabling accounting is only relevant for some specific memcg
3539 * internal allocations. Therefore we would initially not have such a 3534 * internal allocations. Therefore we would initially not have such a
3540 * check here, since direct calls to the page allocator that are marked 3535 * check here, since direct calls to the page allocator that are marked
3541 * with GFP_KMEMCG only happen outside memcg core. We are mostly 3536 * with GFP_KMEMCG only happen outside memcg core. We are mostly
3542 * concerned with cache allocations, and by having this test at 3537 * concerned with cache allocations, and by having this test at
3543 * memcg_kmem_get_cache, we are already able to relay the allocation to 3538 * memcg_kmem_get_cache, we are already able to relay the allocation to
3544 * the root cache and bypass the memcg cache altogether. 3539 * the root cache and bypass the memcg cache altogether.
3545 * 3540 *
3546 * There is one exception, though: the SLUB allocator does not create 3541 * There is one exception, though: the SLUB allocator does not create
3547 * large order caches, but rather services large kmallocs directly from 3542 * large order caches, but rather services large kmallocs directly from
3548 * the page allocator. Therefore, the following sequence when backed by 3543 * the page allocator. Therefore, the following sequence when backed by
3549 * the SLUB allocator: 3544 * the SLUB allocator:
3550 * 3545 *
3551 * memcg_stop_kmem_account(); 3546 * memcg_stop_kmem_account();
3552 * kmalloc(<large_number>) 3547 * kmalloc(<large_number>)
3553 * memcg_resume_kmem_account(); 3548 * memcg_resume_kmem_account();
3554 * 3549 *
3555 * would effectively ignore the fact that we should skip accounting, 3550 * would effectively ignore the fact that we should skip accounting,
3556 * since it will drive us directly to this function without passing 3551 * since it will drive us directly to this function without passing
3557 * through the cache selector memcg_kmem_get_cache. Such large 3552 * through the cache selector memcg_kmem_get_cache. Such large
3558 * allocations are extremely rare but can happen, for instance, for the 3553 * allocations are extremely rare but can happen, for instance, for the
3559 * cache arrays. We bring this test here. 3554 * cache arrays. We bring this test here.
3560 */ 3555 */
3561 if (!current->mm || current->memcg_kmem_skip_account) 3556 if (!current->mm || current->memcg_kmem_skip_account)
3562 return true; 3557 return true;
3563 3558
3564 memcg = get_mem_cgroup_from_mm(current->mm); 3559 memcg = get_mem_cgroup_from_mm(current->mm);
3565 3560
3566 if (!memcg_can_account_kmem(memcg)) { 3561 if (!memcg_can_account_kmem(memcg)) {
3567 css_put(&memcg->css); 3562 css_put(&memcg->css);
3568 return true; 3563 return true;
3569 } 3564 }
3570 3565
3571 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3566 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3572 if (!ret) 3567 if (!ret)
3573 *_memcg = memcg; 3568 *_memcg = memcg;
3574 3569
3575 css_put(&memcg->css); 3570 css_put(&memcg->css);
3576 return (ret == 0); 3571 return (ret == 0);
3577 } 3572 }
3578 3573
3579 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3574 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3580 int order) 3575 int order)
3581 { 3576 {
3582 struct page_cgroup *pc; 3577 struct page_cgroup *pc;
3583 3578
3584 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3579 VM_BUG_ON(mem_cgroup_is_root(memcg));
3585 3580
3586 /* The page allocation failed. Revert */ 3581 /* The page allocation failed. Revert */
3587 if (!page) { 3582 if (!page) {
3588 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3583 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3589 return; 3584 return;
3590 } 3585 }
3591 3586
3592 pc = lookup_page_cgroup(page); 3587 pc = lookup_page_cgroup(page);
3593 lock_page_cgroup(pc); 3588 lock_page_cgroup(pc);
3594 pc->mem_cgroup = memcg; 3589 pc->mem_cgroup = memcg;
3595 SetPageCgroupUsed(pc); 3590 SetPageCgroupUsed(pc);
3596 unlock_page_cgroup(pc); 3591 unlock_page_cgroup(pc);
3597 } 3592 }
3598 3593
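A hedged sketch of the charge-then-commit protocol described in the comment above __memcg_kmem_newpage_charge(), loosely modeled on the page allocator's use of the memcg_kmem_newpage_charge()/memcg_kmem_commit_charge() wrappers; example_charged_alloc() is hypothetical, and the matching free path would call memcg_kmem_uncharge_pages(), defined next:

static struct page *example_charged_alloc(gfp_t gfp, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;	/* charge refused: fail the allocation */

	page = alloc_pages(gfp, order);

	/* commits against the recorded memcg, or reverts if page is NULL */
	memcg_kmem_commit_charge(page, memcg, order);
	return page;
}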
3599 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3594 void __memcg_kmem_uncharge_pages(struct page *page, int order)
3600 { 3595 {
3601 struct mem_cgroup *memcg = NULL; 3596 struct mem_cgroup *memcg = NULL;
3602 struct page_cgroup *pc; 3597 struct page_cgroup *pc;
3603 3598
3604 3599
3605 pc = lookup_page_cgroup(page); 3600 pc = lookup_page_cgroup(page);
3606 /* 3601 /*
3607 * Fast unlocked return. Theoretically might have changed, have to 3602 * Fast unlocked return. Theoretically might have changed, have to
3608 * check again after locking. 3603 * check again after locking.
3609 */ 3604 */
3610 if (!PageCgroupUsed(pc)) 3605 if (!PageCgroupUsed(pc))
3611 return; 3606 return;
3612 3607
3613 lock_page_cgroup(pc); 3608 lock_page_cgroup(pc);
3614 if (PageCgroupUsed(pc)) { 3609 if (PageCgroupUsed(pc)) {
3615 memcg = pc->mem_cgroup; 3610 memcg = pc->mem_cgroup;
3616 ClearPageCgroupUsed(pc); 3611 ClearPageCgroupUsed(pc);
3617 } 3612 }
3618 unlock_page_cgroup(pc); 3613 unlock_page_cgroup(pc);
3619 3614
3620 /* 3615 /*
3621 * We trust that only if there is a memcg associated with the page, it 3616 * We trust that only if there is a memcg associated with the page, it
3622 * is a valid allocation 3617 * is a valid allocation
3623 */ 3618 */
3624 if (!memcg) 3619 if (!memcg)
3625 return; 3620 return;
3626 3621
3627 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3622 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3628 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3623 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3629 } 3624 }
3630 #else 3625 #else
3631 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3626 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3632 { 3627 {
3633 } 3628 }
3634 #endif /* CONFIG_MEMCG_KMEM */ 3629 #endif /* CONFIG_MEMCG_KMEM */
3635 3630
3636 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3631 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3637 3632
3638 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3633 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3639 /* 3634 /*
3640 * Because tail pages are not marked as "used", set it. We're under 3635 * Because tail pages are not marked as "used", set it. We're under
3641 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3636 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3642 * charge/uncharge will never happen and move_account() is done under 3637 * charge/uncharge will never happen and move_account() is done under
3643 * compound_lock(), so we don't have to take care of races. 3638 * compound_lock(), so we don't have to take care of races.
3644 */ 3639 */
3645 void mem_cgroup_split_huge_fixup(struct page *head) 3640 void mem_cgroup_split_huge_fixup(struct page *head)
3646 { 3641 {
3647 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3642 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3648 struct page_cgroup *pc; 3643 struct page_cgroup *pc;
3649 struct mem_cgroup *memcg; 3644 struct mem_cgroup *memcg;
3650 int i; 3645 int i;
3651 3646
3652 if (mem_cgroup_disabled()) 3647 if (mem_cgroup_disabled())
3653 return; 3648 return;
3654 3649
3655 memcg = head_pc->mem_cgroup; 3650 memcg = head_pc->mem_cgroup;
3656 for (i = 1; i < HPAGE_PMD_NR; i++) { 3651 for (i = 1; i < HPAGE_PMD_NR; i++) {
3657 pc = head_pc + i; 3652 pc = head_pc + i;
3658 pc->mem_cgroup = memcg; 3653 pc->mem_cgroup = memcg;
3659 smp_wmb();/* see __commit_charge() */ 3654 smp_wmb();/* see __commit_charge() */
3660 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3655 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3661 } 3656 }
3662 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3657 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3663 HPAGE_PMD_NR); 3658 HPAGE_PMD_NR);
3664 } 3659 }
3665 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3660 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3666 3661
3667 /** 3662 /**
3668 * mem_cgroup_move_account - move account of the page 3663 * mem_cgroup_move_account - move account of the page
3669 * @page: the page 3664 * @page: the page
3670 * @nr_pages: number of regular pages (>1 for huge pages) 3665 * @nr_pages: number of regular pages (>1 for huge pages)
3671 * @pc: page_cgroup of the page. 3666 * @pc: page_cgroup of the page.
3672 * @from: mem_cgroup which the page is moved from. 3667 * @from: mem_cgroup which the page is moved from.
3673 * @to: mem_cgroup which the page is moved to. @from != @to. 3668 * @to: mem_cgroup which the page is moved to. @from != @to.
3674 * 3669 *
3675 * The caller must confirm the following. 3670 * The caller must confirm the following.
3676 * - page is not on LRU (isolate_page() is useful.) 3671 * - page is not on LRU (isolate_page() is useful.)
3677 * - compound_lock is held when nr_pages > 1 3672 * - compound_lock is held when nr_pages > 1
3678 * 3673 *
3679 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3674 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3680 * from old cgroup. 3675 * from old cgroup.
3681 */ 3676 */
3682 static int mem_cgroup_move_account(struct page *page, 3677 static int mem_cgroup_move_account(struct page *page,
3683 unsigned int nr_pages, 3678 unsigned int nr_pages,
3684 struct page_cgroup *pc, 3679 struct page_cgroup *pc,
3685 struct mem_cgroup *from, 3680 struct mem_cgroup *from,
3686 struct mem_cgroup *to) 3681 struct mem_cgroup *to)
3687 { 3682 {
3688 unsigned long flags; 3683 unsigned long flags;
3689 int ret; 3684 int ret;
3690 bool anon = PageAnon(page); 3685 bool anon = PageAnon(page);
3691 3686
3692 VM_BUG_ON(from == to); 3687 VM_BUG_ON(from == to);
3693 VM_BUG_ON_PAGE(PageLRU(page), page); 3688 VM_BUG_ON_PAGE(PageLRU(page), page);
3694 /* 3689 /*
3695 * The page is isolated from LRU. So, collapse function 3690 * The page is isolated from LRU. So, collapse function
3696 * will not handle this page. But page splitting can happen. 3691 * will not handle this page. But page splitting can happen.
3697 * Do this check under compound_page_lock(). The caller should 3692 * Do this check under compound_page_lock(). The caller should
3698 * hold it. 3693 * hold it.
3699 */ 3694 */
3700 ret = -EBUSY; 3695 ret = -EBUSY;
3701 if (nr_pages > 1 && !PageTransHuge(page)) 3696 if (nr_pages > 1 && !PageTransHuge(page))
3702 goto out; 3697 goto out;
3703 3698
3704 lock_page_cgroup(pc); 3699 lock_page_cgroup(pc);
3705 3700
3706 ret = -EINVAL; 3701 ret = -EINVAL;
3707 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3702 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3708 goto unlock; 3703 goto unlock;
3709 3704
3710 move_lock_mem_cgroup(from, &flags); 3705 move_lock_mem_cgroup(from, &flags);
3711 3706
3712 if (!anon && page_mapped(page)) { 3707 if (!anon && page_mapped(page)) {
3713 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3708 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3714 nr_pages); 3709 nr_pages);
3715 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3710 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3716 nr_pages); 3711 nr_pages);
3717 } 3712 }
3718 3713
3719 if (PageWriteback(page)) { 3714 if (PageWriteback(page)) {
3720 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3715 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3721 nr_pages); 3716 nr_pages);
3722 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3717 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3723 nr_pages); 3718 nr_pages);
3724 } 3719 }
3725 3720
3726 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3721 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3727 3722
3728 /* caller should have done css_get */ 3723 /* caller should have done css_get */
3729 pc->mem_cgroup = to; 3724 pc->mem_cgroup = to;
3730 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3725 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3731 move_unlock_mem_cgroup(from, &flags); 3726 move_unlock_mem_cgroup(from, &flags);
3732 ret = 0; 3727 ret = 0;
3733 unlock: 3728 unlock:
3734 unlock_page_cgroup(pc); 3729 unlock_page_cgroup(pc);
3735 /* 3730 /*
3736 * check events 3731 * check events
3737 */ 3732 */
3738 memcg_check_events(to, page); 3733 memcg_check_events(to, page);
3739 memcg_check_events(from, page); 3734 memcg_check_events(from, page);
3740 out: 3735 out:
3741 return ret; 3736 return ret;
3742 } 3737 }
3743 3738
3744 /** 3739 /**
3745 * mem_cgroup_move_parent - moves page to the parent group 3740 * mem_cgroup_move_parent - moves page to the parent group
3746 * @page: the page to move 3741 * @page: the page to move
3747 * @pc: page_cgroup of the page 3742 * @pc: page_cgroup of the page
3748 * @child: page's cgroup 3743 * @child: page's cgroup
3749 * 3744 *
3750 * move charges to its parent or the root cgroup if the group has no 3745 * move charges to its parent or the root cgroup if the group has no
3751 * parent (aka use_hierarchy==0). 3746 * parent (aka use_hierarchy==0).
3752 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3747 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3753 * mem_cgroup_move_account fails) the failure is always temporary and 3748 * mem_cgroup_move_account fails) the failure is always temporary and
3754 * it signals a race with a page removal/uncharge or migration. In the 3749 * it signals a race with a page removal/uncharge or migration. In the
3755 * first case the page is on the way out and it will vanish from the LRU 3750 * first case the page is on the way out and it will vanish from the LRU
3756 * on the next attempt and the call should be retried later. 3751 * on the next attempt and the call should be retried later.
3757 * Isolation from the LRU fails only if page has been isolated from 3752 * Isolation from the LRU fails only if page has been isolated from
3758 * the LRU since we looked at it and that usually means either global 3753 * the LRU since we looked at it and that usually means either global
3759 * reclaim or migration going on. The page will either get back to the 3754 * reclaim or migration going on. The page will either get back to the
3760 * LRU or vanish. 3755 * LRU or vanish.
3761 * Finally mem_cgroup_move_account fails only if the page got uncharged 3756 * Finally mem_cgroup_move_account fails only if the page got uncharged
3762 * (!PageCgroupUsed) or moved to a different group. The page will 3757 * (!PageCgroupUsed) or moved to a different group. The page will
3763 * disappear in the next attempt. 3758 * disappear in the next attempt.
3764 */ 3759 */
3765 static int mem_cgroup_move_parent(struct page *page, 3760 static int mem_cgroup_move_parent(struct page *page,
3766 struct page_cgroup *pc, 3761 struct page_cgroup *pc,
3767 struct mem_cgroup *child) 3762 struct mem_cgroup *child)
3768 { 3763 {
3769 struct mem_cgroup *parent; 3764 struct mem_cgroup *parent;
3770 unsigned int nr_pages; 3765 unsigned int nr_pages;
3771 unsigned long uninitialized_var(flags); 3766 unsigned long uninitialized_var(flags);
3772 int ret; 3767 int ret;
3773 3768
3774 VM_BUG_ON(mem_cgroup_is_root(child)); 3769 VM_BUG_ON(mem_cgroup_is_root(child));
3775 3770
3776 ret = -EBUSY; 3771 ret = -EBUSY;
3777 if (!get_page_unless_zero(page)) 3772 if (!get_page_unless_zero(page))
3778 goto out; 3773 goto out;
3779 if (isolate_lru_page(page)) 3774 if (isolate_lru_page(page))
3780 goto put; 3775 goto put;
3781 3776
3782 nr_pages = hpage_nr_pages(page); 3777 nr_pages = hpage_nr_pages(page);
3783 3778
3784 parent = parent_mem_cgroup(child); 3779 parent = parent_mem_cgroup(child);
3785 /* 3780 /*
3786 * If no parent, move charges to root cgroup. 3781 * If no parent, move charges to root cgroup.
3787 */ 3782 */
3788 if (!parent) 3783 if (!parent)
3789 parent = root_mem_cgroup; 3784 parent = root_mem_cgroup;
3790 3785
3791 if (nr_pages > 1) { 3786 if (nr_pages > 1) {
3792 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3787 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3793 flags = compound_lock_irqsave(page); 3788 flags = compound_lock_irqsave(page);
3794 } 3789 }
3795 3790
3796 ret = mem_cgroup_move_account(page, nr_pages, 3791 ret = mem_cgroup_move_account(page, nr_pages,
3797 pc, child, parent); 3792 pc, child, parent);
3798 if (!ret) 3793 if (!ret)
3799 __mem_cgroup_cancel_local_charge(child, nr_pages); 3794 __mem_cgroup_cancel_local_charge(child, nr_pages);
3800 3795
3801 if (nr_pages > 1) 3796 if (nr_pages > 1)
3802 compound_unlock_irqrestore(page, flags); 3797 compound_unlock_irqrestore(page, flags);
3803 putback_lru_page(page); 3798 putback_lru_page(page);
3804 put: 3799 put:
3805 put_page(page); 3800 put_page(page);
3806 out: 3801 out:
3807 return ret; 3802 return ret;
3808 } 3803 }
3809 3804
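The temporary-failure semantics spelled out above can be sketched as a bounded retry; example_reparent_page() is purely illustrative (the real reparenting code walks the LRU lists rather than spinning on a single page):

static void example_reparent_page(struct page *page, struct page_cgroup *pc,
				  struct mem_cgroup *child)
{
	int retries = 5;

	/* a failure signals a race with uncharge/migration; back off and retry */
	while (mem_cgroup_move_parent(page, pc, child) && --retries)
		cond_resched();
}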
3810 int mem_cgroup_charge_anon(struct page *page, 3805 int mem_cgroup_charge_anon(struct page *page,
3811 struct mm_struct *mm, gfp_t gfp_mask) 3806 struct mm_struct *mm, gfp_t gfp_mask)
3812 { 3807 {
3813 unsigned int nr_pages = 1; 3808 unsigned int nr_pages = 1;
3814 struct mem_cgroup *memcg; 3809 struct mem_cgroup *memcg;
3815 bool oom = true; 3810 bool oom = true;
3816 3811
3817 if (mem_cgroup_disabled()) 3812 if (mem_cgroup_disabled())
3818 return 0; 3813 return 0;
3819 3814
3820 VM_BUG_ON_PAGE(page_mapped(page), page); 3815 VM_BUG_ON_PAGE(page_mapped(page), page);
3821 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); 3816 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3822 VM_BUG_ON(!mm); 3817 VM_BUG_ON(!mm);
3823 3818
3824 if (PageTransHuge(page)) { 3819 if (PageTransHuge(page)) {
3825 nr_pages <<= compound_order(page); 3820 nr_pages <<= compound_order(page);
3826 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3821 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3827 /* 3822 /*
3828 * Never OOM-kill a process for a huge page. The 3823 * Never OOM-kill a process for a huge page. The
3829 * fault handler will fall back to regular pages. 3824 * fault handler will fall back to regular pages.
3830 */ 3825 */
3831 oom = false; 3826 oom = false;
3832 } 3827 }
3833 3828
3834 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); 3829 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
3835 if (!memcg) 3830 if (!memcg)
3836 return -ENOMEM; 3831 return -ENOMEM;
3837 __mem_cgroup_commit_charge(memcg, page, nr_pages, 3832 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3838 MEM_CGROUP_CHARGE_TYPE_ANON, false); 3833 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3839 return 0; 3834 return 0;
3840 } 3835 }
3841 3836
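A hedged sketch of how an anonymous-fault path would use this; the helper and its error handling are illustrative, not the actual fault handler:

static int example_charge_new_anon(struct mm_struct *mm, struct page *page)
{
	if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
		return -ENOMEM;	/* charge failed; the caller frees the page */
	/* ... install the pte and add the page to the LRU ... */
	return 0;
}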
3842 /* 3837 /*
3843 * While swap-in, try_charge -> commit or cancel, the page is locked. 3838 * While swap-in, try_charge -> commit or cancel, the page is locked.
3844 * And when try_charge() successfully returns, one refcnt to memcg without 3839 * And when try_charge() successfully returns, one refcnt to memcg without
3845 * struct page_cgroup is acquired. This refcnt will be consumed by 3840 * struct page_cgroup is acquired. This refcnt will be consumed by
3846 * "commit()" or removed by "cancel()" 3841 * "commit()" or removed by "cancel()"
3847 */ 3842 */
3848 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 3843 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3849 struct page *page, 3844 struct page *page,
3850 gfp_t mask, 3845 gfp_t mask,
3851 struct mem_cgroup **memcgp) 3846 struct mem_cgroup **memcgp)
3852 { 3847 {
3853 struct mem_cgroup *memcg = NULL; 3848 struct mem_cgroup *memcg = NULL;
3854 struct page_cgroup *pc; 3849 struct page_cgroup *pc;
3855 int ret; 3850 int ret;
3856 3851
3857 pc = lookup_page_cgroup(page); 3852 pc = lookup_page_cgroup(page);
3858 /* 3853 /*
3859 * Every swap fault against a single page tries to charge the 3854 * Every swap fault against a single page tries to charge the
3860 * page, bail as early as possible. shmem_unuse() encounters 3855 * page, bail as early as possible. shmem_unuse() encounters
3861 * already charged pages, too. The USED bit is protected by 3856 * already charged pages, too. The USED bit is protected by
3862 * the page lock, which serializes swap cache removal, which 3857 * the page lock, which serializes swap cache removal, which
3863 * in turn serializes uncharging. 3858 * in turn serializes uncharging.
3864 */ 3859 */
3865 if (PageCgroupUsed(pc)) 3860 if (PageCgroupUsed(pc))
3866 goto out; 3861 goto out;
3867 if (do_swap_account) 3862 if (do_swap_account)
3868 memcg = try_get_mem_cgroup_from_page(page); 3863 memcg = try_get_mem_cgroup_from_page(page);
3869 if (!memcg) 3864 if (!memcg)
3870 memcg = get_mem_cgroup_from_mm(mm); 3865 memcg = get_mem_cgroup_from_mm(mm);
3871 ret = mem_cgroup_try_charge(memcg, mask, 1, true); 3866 ret = mem_cgroup_try_charge(memcg, mask, 1, true);
3872 css_put(&memcg->css); 3867 css_put(&memcg->css);
3873 if (ret == -EINTR) 3868 if (ret == -EINTR)
3874 memcg = root_mem_cgroup; 3869 memcg = root_mem_cgroup;
3875 else if (ret) 3870 else if (ret)
3876 return ret; 3871 return ret;
3877 out: 3872 out:
3878 *memcgp = memcg; 3873 *memcgp = memcg;
3879 return 0; 3874 return 0;
3880 } 3875 }
3881 3876
3882 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 3877 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3883 gfp_t gfp_mask, struct mem_cgroup **memcgp) 3878 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3884 { 3879 {
3885 if (mem_cgroup_disabled()) { 3880 if (mem_cgroup_disabled()) {
3886 *memcgp = NULL; 3881 *memcgp = NULL;
3887 return 0; 3882 return 0;
3888 } 3883 }
3889 /* 3884 /*
3890 * A racing thread's fault, or swapoff, may have already 3885 * A racing thread's fault, or swapoff, may have already
3891 * updated the pte, and even removed page from swap cache: in 3886 * updated the pte, and even removed page from swap cache: in
3892 * those cases unuse_pte()'s pte_same() test will fail; but 3887 * those cases unuse_pte()'s pte_same() test will fail; but
3893 * there's also a KSM case which does need to charge the page. 3888 * there's also a KSM case which does need to charge the page.
3894 */ 3889 */
3895 if (!PageSwapCache(page)) { 3890 if (!PageSwapCache(page)) {
3896 struct mem_cgroup *memcg; 3891 struct mem_cgroup *memcg;
3897 3892
3898 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3893 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3899 if (!memcg) 3894 if (!memcg)
3900 return -ENOMEM; 3895 return -ENOMEM;
3901 *memcgp = memcg; 3896 *memcgp = memcg;
3902 return 0; 3897 return 0;
3903 } 3898 }
3904 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 3899 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3905 } 3900 }
3906 3901
3907 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 3902 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3908 { 3903 {
3909 if (mem_cgroup_disabled()) 3904 if (mem_cgroup_disabled())
3910 return; 3905 return;
3911 if (!memcg) 3906 if (!memcg)
3912 return; 3907 return;
3913 __mem_cgroup_cancel_charge(memcg, 1); 3908 __mem_cgroup_cancel_charge(memcg, 1);
3914 } 3909 }
3915 3910
3916 static void 3911 static void
3917 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 3912 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3918 enum charge_type ctype) 3913 enum charge_type ctype)
3919 { 3914 {
3920 if (mem_cgroup_disabled()) 3915 if (mem_cgroup_disabled())
3921 return; 3916 return;
3922 if (!memcg) 3917 if (!memcg)
3923 return; 3918 return;
3924 3919
3925 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3920 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3926 /* 3921 /*
3927 * Now swap is on-memory. This means this page may be 3922 * Now swap is on-memory. This means this page may be
3928 * counted both as mem and swap, i.e. a double count. 3923 * counted both as mem and swap, i.e. a double count.
3929 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 3924 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
3930 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 3925 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
3931 * may call delete_from_swap_cache() before we reach here. 3926 * may call delete_from_swap_cache() before we reach here.
3932 */ 3927 */
3933 if (do_swap_account && PageSwapCache(page)) { 3928 if (do_swap_account && PageSwapCache(page)) {
3934 swp_entry_t ent = {.val = page_private(page)}; 3929 swp_entry_t ent = {.val = page_private(page)};
3935 mem_cgroup_uncharge_swap(ent); 3930 mem_cgroup_uncharge_swap(ent);
3936 } 3931 }
3937 } 3932 }
3938 3933
3939 void mem_cgroup_commit_charge_swapin(struct page *page, 3934 void mem_cgroup_commit_charge_swapin(struct page *page,
3940 struct mem_cgroup *memcg) 3935 struct mem_cgroup *memcg)
3941 { 3936 {
3942 __mem_cgroup_commit_charge_swapin(page, memcg, 3937 __mem_cgroup_commit_charge_swapin(page, memcg,
3943 MEM_CGROUP_CHARGE_TYPE_ANON); 3938 MEM_CGROUP_CHARGE_TYPE_ANON);
3944 } 3939 }
3945 3940
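A hedged sketch of the try_charge -> commit-or-cancel protocol described above __mem_cgroup_try_charge_swapin(); example_swapin() and its map_ok parameter are illustrative only:

static int example_swapin(struct mm_struct *mm, struct page *page, bool map_ok)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
		return -ENOMEM;

	if (!map_ok) {
		/* installing the pte failed: drop the pre-charge */
		mem_cgroup_cancel_charge_swapin(memcg);
		return -EFAULT;
	}

	/* the pte is in place: commit, and uncharge the memsw double count */
	mem_cgroup_commit_charge_swapin(page, memcg);
	return 0;
}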
3946 int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, 3941 int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3947 gfp_t gfp_mask) 3942 gfp_t gfp_mask)
3948 { 3943 {
3949 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3944 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3950 struct mem_cgroup *memcg; 3945 struct mem_cgroup *memcg;
3951 int ret; 3946 int ret;
3952 3947
3953 if (mem_cgroup_disabled()) 3948 if (mem_cgroup_disabled())
3954 return 0; 3949 return 0;
3955 if (PageCompound(page)) 3950 if (PageCompound(page))
3956 return 0; 3951 return 0;
3957 3952
3958 if (PageSwapCache(page)) { /* shmem */ 3953 if (PageSwapCache(page)) { /* shmem */
3959 ret = __mem_cgroup_try_charge_swapin(mm, page, 3954 ret = __mem_cgroup_try_charge_swapin(mm, page,
3960 gfp_mask, &memcg); 3955 gfp_mask, &memcg);
3961 if (ret) 3956 if (ret)
3962 return ret; 3957 return ret;
3963 __mem_cgroup_commit_charge_swapin(page, memcg, type); 3958 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3964 return 0; 3959 return 0;
3965 } 3960 }
3966 3961
3967 /* 3962 /*
3968 * Page cache insertions can happen without an actual mm 3963 * Page cache insertions can happen without an actual mm
3969 * context, e.g. during disk probing on boot. 3964 * context, e.g. during disk probing on boot.
3970 */ 3965 */
3971 if (unlikely(!mm)) 3966 if (unlikely(!mm))
3972 memcg = root_mem_cgroup; 3967 memcg = root_mem_cgroup;
3973 else { 3968 else {
3974 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3969 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3975 if (!memcg) 3970 if (!memcg)
3976 return -ENOMEM; 3971 return -ENOMEM;
3977 } 3972 }
3978 __mem_cgroup_commit_charge(memcg, page, 1, type, false); 3973 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3979 return 0; 3974 return 0;
3980 } 3975 }
3981 3976
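A hedged sketch of the page-cache side; example_add_to_cache() is illustrative and stands in for the real page-cache insertion path:

static int example_add_to_cache(struct page *page, struct mm_struct *mm,
				gfp_t gfp)
{
	int ret;

	ret = mem_cgroup_charge_file(page, mm, gfp);
	if (ret)
		return ret;	/* over limit or interrupted: do not insert */
	/* ... insert the page into the mapping and onto the LRU ... */
	return 0;
}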
3982 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 3977 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3983 unsigned int nr_pages, 3978 unsigned int nr_pages,
3984 const enum charge_type ctype) 3979 const enum charge_type ctype)
3985 { 3980 {
3986 struct memcg_batch_info *batch = NULL; 3981 struct memcg_batch_info *batch = NULL;
3987 bool uncharge_memsw = true; 3982 bool uncharge_memsw = true;
3988 3983
3989 /* If swapout, usage of swap doesn't decrease */ 3984 /* If swapout, usage of swap doesn't decrease */
3990 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 3985 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3991 uncharge_memsw = false; 3986 uncharge_memsw = false;
3992 3987
3993 batch = &current->memcg_batch; 3988 batch = &current->memcg_batch;
3994 /* 3989 /*
3995 * Usually, we do css_get() when we remember the memcg pointer. 3990 * Usually, we do css_get() when we remember the memcg pointer.
3996 * But in this case, we keep res->usage until the end of a series of 3991 * But in this case, we keep res->usage until the end of a series of
3997 * uncharges. Then, it's ok to ignore memcg's refcnt. 3992 * uncharges. Then, it's ok to ignore memcg's refcnt.
3998 */ 3993 */
3999 if (!batch->memcg) 3994 if (!batch->memcg)
4000 batch->memcg = memcg; 3995 batch->memcg = memcg;
4001 /* 3996 /*
4002 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 3997 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4003 * In those cases, all pages freed continuously can be expected to be in 3998 * In those cases, all pages freed continuously can be expected to be in
4004 * the same cgroup and we have chance to coalesce uncharges. 3999 * the same cgroup and we have chance to coalesce uncharges.
4005 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 4000 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
4006 * because we want to do uncharge as soon as possible. 4001 * because we want to do uncharge as soon as possible.
4007 */ 4002 */
4008 4003
4009 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 4004 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4010 goto direct_uncharge; 4005 goto direct_uncharge;
4011 4006
4012 if (nr_pages > 1) 4007 if (nr_pages > 1)
4013 goto direct_uncharge; 4008 goto direct_uncharge;
4014 4009
4015 /* 4010 /*
4016 * In the typical case, batch->memcg == mem. This means we can 4011 * In the typical case, batch->memcg == mem. This means we can
4017 * merge a series of uncharges to an uncharge of res_counter. 4012 * merge a series of uncharges to an uncharge of res_counter.
4018 * If not, we uncharge res_counter one by one. 4013 * If not, we uncharge res_counter one by one.
4019 */ 4014 */
4020 if (batch->memcg != memcg) 4015 if (batch->memcg != memcg)
4021 goto direct_uncharge; 4016 goto direct_uncharge;
4022 /* remember freed charge and uncharge it later */ 4017 /* remember freed charge and uncharge it later */
4023 batch->nr_pages++; 4018 batch->nr_pages++;
4024 if (uncharge_memsw) 4019 if (uncharge_memsw)
4025 batch->memsw_nr_pages++; 4020 batch->memsw_nr_pages++;
4026 return; 4021 return;
4027 direct_uncharge: 4022 direct_uncharge:
4028 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4023 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4029 if (uncharge_memsw) 4024 if (uncharge_memsw)
4030 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4025 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4031 if (unlikely(batch->memcg != memcg)) 4026 if (unlikely(batch->memcg != memcg))
4032 memcg_oom_recover(memcg); 4027 memcg_oom_recover(memcg);
4033 } 4028 }
4034 4029
4035 /* 4030 /*
4036 * uncharge if !page_mapped(page) 4031 * uncharge if !page_mapped(page)
4037 */ 4032 */
4038 static struct mem_cgroup * 4033 static struct mem_cgroup *
4039 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4034 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4040 bool end_migration) 4035 bool end_migration)
4041 { 4036 {
4042 struct mem_cgroup *memcg = NULL; 4037 struct mem_cgroup *memcg = NULL;
4043 unsigned int nr_pages = 1; 4038 unsigned int nr_pages = 1;
4044 struct page_cgroup *pc; 4039 struct page_cgroup *pc;
4045 bool anon; 4040 bool anon;
4046 4041
4047 if (mem_cgroup_disabled()) 4042 if (mem_cgroup_disabled())
4048 return NULL; 4043 return NULL;
4049 4044
4050 if (PageTransHuge(page)) { 4045 if (PageTransHuge(page)) {
4051 nr_pages <<= compound_order(page); 4046 nr_pages <<= compound_order(page);
4052 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 4047 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4053 } 4048 }
4054 /* 4049 /*
4055 * Check if our page_cgroup is valid 4050 * Check if our page_cgroup is valid
4056 */ 4051 */
4057 pc = lookup_page_cgroup(page); 4052 pc = lookup_page_cgroup(page);
4058 if (unlikely(!PageCgroupUsed(pc))) 4053 if (unlikely(!PageCgroupUsed(pc)))
4059 return NULL; 4054 return NULL;
4060 4055
4061 lock_page_cgroup(pc); 4056 lock_page_cgroup(pc);
4062 4057
4063 memcg = pc->mem_cgroup; 4058 memcg = pc->mem_cgroup;
4064 4059
4065 if (!PageCgroupUsed(pc)) 4060 if (!PageCgroupUsed(pc))
4066 goto unlock_out; 4061 goto unlock_out;
4067 4062
4068 anon = PageAnon(page); 4063 anon = PageAnon(page);
4069 4064
4070 switch (ctype) { 4065 switch (ctype) {
4071 case MEM_CGROUP_CHARGE_TYPE_ANON: 4066 case MEM_CGROUP_CHARGE_TYPE_ANON:
4072 /* 4067 /*
4073 * Generally PageAnon tells if it's the anon statistics to be 4068 * Generally PageAnon tells if it's the anon statistics to be
4074 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4069 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
4075 * used before page reached the stage of being marked PageAnon. 4070 * used before page reached the stage of being marked PageAnon.
4076 */ 4071 */
4077 anon = true; 4072 anon = true;
4078 /* fallthrough */ 4073 /* fallthrough */
4079 case MEM_CGROUP_CHARGE_TYPE_DROP: 4074 case MEM_CGROUP_CHARGE_TYPE_DROP:
4080 /* See mem_cgroup_prepare_migration() */ 4075 /* See mem_cgroup_prepare_migration() */
4081 if (page_mapped(page)) 4076 if (page_mapped(page))
4082 goto unlock_out; 4077 goto unlock_out;
4083 /* 4078 /*
4084 * Pages under migration may not be uncharged. But 4079 * Pages under migration may not be uncharged. But
4085 * end_migration() /must/ be the one uncharging the 4080 * end_migration() /must/ be the one uncharging the
4086 * unused post-migration page and so it has to call 4081 * unused post-migration page and so it has to call
4087 * here with the migration bit still set. See the 4082 * here with the migration bit still set. See the
4088 * res_counter handling below. 4083 * res_counter handling below.
4089 */ 4084 */
4090 if (!end_migration && PageCgroupMigration(pc)) 4085 if (!end_migration && PageCgroupMigration(pc))
4091 goto unlock_out; 4086 goto unlock_out;
4092 break; 4087 break;
4093 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4088 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4094 if (!PageAnon(page)) { /* Shared memory */ 4089 if (!PageAnon(page)) { /* Shared memory */
4095 if (page->mapping && !page_is_file_cache(page)) 4090 if (page->mapping && !page_is_file_cache(page))
4096 goto unlock_out; 4091 goto unlock_out;
4097 } else if (page_mapped(page)) /* Anon */ 4092 } else if (page_mapped(page)) /* Anon */
4098 goto unlock_out; 4093 goto unlock_out;
4099 break; 4094 break;
4100 default: 4095 default:
4101 break; 4096 break;
4102 } 4097 }
4103 4098
4104 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4099 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4105 4100
4106 ClearPageCgroupUsed(pc); 4101 ClearPageCgroupUsed(pc);
4107 /* 4102 /*
4108 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4103 * pc->mem_cgroup is not cleared here. It will be accessed when it's
4109 * freed from LRU. This is safe because uncharged page is expected not 4104 * freed from LRU. This is safe because uncharged page is expected not
4110 * to be reused (freed soon). Exception is SwapCache, it's handled by 4105 * to be reused (freed soon). Exception is SwapCache, it's handled by
4111 * special functions. 4106 * special functions.
4112 */ 4107 */
4113 4108
4114 unlock_page_cgroup(pc); 4109 unlock_page_cgroup(pc);
4115 /* 4110 /*
4116 * even after unlock, we have memcg->res.usage here and this memcg 4111 * even after unlock, we have memcg->res.usage here and this memcg
4117 * will never be freed, so it's safe to call css_get(). 4112 * will never be freed, so it's safe to call css_get().
4118 */ 4113 */
4119 memcg_check_events(memcg, page); 4114 memcg_check_events(memcg, page);
4120 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4115 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4121 mem_cgroup_swap_statistics(memcg, true); 4116 mem_cgroup_swap_statistics(memcg, true);
4122 css_get(&memcg->css); 4117 css_get(&memcg->css);
4123 } 4118 }
4124 /* 4119 /*
4125 * Migration does not charge the res_counter for the 4120 * Migration does not charge the res_counter for the
4126 * replacement page, so leave it alone when phasing out the 4121 * replacement page, so leave it alone when phasing out the
4127 * page that is unused after the migration. 4122 * page that is unused after the migration.
4128 */ 4123 */
4129 if (!end_migration && !mem_cgroup_is_root(memcg)) 4124 if (!end_migration && !mem_cgroup_is_root(memcg))
4130 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 4125 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4131 4126
4132 return memcg; 4127 return memcg;
4133 4128
4134 unlock_out: 4129 unlock_out:
4135 unlock_page_cgroup(pc); 4130 unlock_page_cgroup(pc);
4136 return NULL; 4131 return NULL;
4137 } 4132 }
4138 4133
4139 void mem_cgroup_uncharge_page(struct page *page) 4134 void mem_cgroup_uncharge_page(struct page *page)
4140 { 4135 {
4141 /* early check. */ 4136 /* early check. */
4142 if (page_mapped(page)) 4137 if (page_mapped(page))
4143 return; 4138 return;
4144 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); 4139 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4145 /* 4140 /*
4146 * If the page is in swap cache, uncharge should be deferred 4141 * If the page is in swap cache, uncharge should be deferred
4147 * to the swap path, which also properly accounts swap usage 4142 * to the swap path, which also properly accounts swap usage
4148 * and handles memcg lifetime. 4143 * and handles memcg lifetime.
4149 * 4144 *
4150 * Note that this check is not stable and reclaim may add the 4145 * Note that this check is not stable and reclaim may add the
4151 * page to swap cache at any time after this. However, if the 4146 * page to swap cache at any time after this. However, if the
4152 * page is not in swap cache by the time page->mapcount hits 4147 * page is not in swap cache by the time page->mapcount hits
4153 * 0, there won't be any page table references to the swap 4148 * 0, there won't be any page table references to the swap
4154 * slot, and reclaim will free it and not actually write the 4149 * slot, and reclaim will free it and not actually write the
4155 * page to disk. 4150 * page to disk.
4156 */ 4151 */
4157 if (PageSwapCache(page)) 4152 if (PageSwapCache(page))
4158 return; 4153 return;
4159 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); 4154 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4160 } 4155 }
4161 4156
4162 void mem_cgroup_uncharge_cache_page(struct page *page) 4157 void mem_cgroup_uncharge_cache_page(struct page *page)
4163 { 4158 {
4164 VM_BUG_ON_PAGE(page_mapped(page), page); 4159 VM_BUG_ON_PAGE(page_mapped(page), page);
4165 VM_BUG_ON_PAGE(page->mapping, page); 4160 VM_BUG_ON_PAGE(page->mapping, page);
4166 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4161 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4167 } 4162 }
4168 4163
4169 /* 4164 /*
4170 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. 4165 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
4171 * In those cases, pages are freed continuously and we can expect the pages 4166 * In those cases, pages are freed continuously and we can expect the pages
4172 * to be in the same memcg. Each of these calls itself limits the number of 4167 * to be in the same memcg. Each of these calls itself limits the number of
4173 * pages freed at once, so uncharge_start/end() is called properly. 4168 * pages freed at once, so uncharge_start/end() is called properly.
4174 * This may be called multiple (2) times in one context. 4169 * This may be called multiple (2) times in one context.
4175 */ 4170 */
4176 4171
4177 void mem_cgroup_uncharge_start(void) 4172 void mem_cgroup_uncharge_start(void)
4178 { 4173 {
4179 current->memcg_batch.do_batch++; 4174 current->memcg_batch.do_batch++;
4180 /* We can nest. */ 4175 /* We can nest. */
4181 if (current->memcg_batch.do_batch == 1) { 4176 if (current->memcg_batch.do_batch == 1) {
4182 current->memcg_batch.memcg = NULL; 4177 current->memcg_batch.memcg = NULL;
4183 current->memcg_batch.nr_pages = 0; 4178 current->memcg_batch.nr_pages = 0;
4184 current->memcg_batch.memsw_nr_pages = 0; 4179 current->memcg_batch.memsw_nr_pages = 0;
4185 } 4180 }
4186 } 4181 }
4187 4182
4188 void mem_cgroup_uncharge_end(void) 4183 void mem_cgroup_uncharge_end(void)
4189 { 4184 {
4190 struct memcg_batch_info *batch = &current->memcg_batch; 4185 struct memcg_batch_info *batch = &current->memcg_batch;
4191 4186
4192 if (!batch->do_batch) 4187 if (!batch->do_batch)
4193 return; 4188 return;
4194 4189
4195 batch->do_batch--; 4190 batch->do_batch--;
4196 if (batch->do_batch) /* If stacked, do nothing. */ 4191 if (batch->do_batch) /* If stacked, do nothing. */
4197 return; 4192 return;
4198 4193
4199 if (!batch->memcg) 4194 if (!batch->memcg)
4200 return; 4195 return;
4201 /* 4196 /*
4202 * This "batch->memcg" is valid without any css_get/put etc... 4197 * This "batch->memcg" is valid without any css_get/put etc...
4203 * because we hide charges behind us. 4198 * because we hide charges behind us.
4204 */ 4199 */
4205 if (batch->nr_pages) 4200 if (batch->nr_pages)
4206 res_counter_uncharge(&batch->memcg->res, 4201 res_counter_uncharge(&batch->memcg->res,
4207 batch->nr_pages * PAGE_SIZE); 4202 batch->nr_pages * PAGE_SIZE);
4208 if (batch->memsw_nr_pages) 4203 if (batch->memsw_nr_pages)
4209 res_counter_uncharge(&batch->memcg->memsw, 4204 res_counter_uncharge(&batch->memcg->memsw,
4210 batch->memsw_nr_pages * PAGE_SIZE); 4205 batch->memsw_nr_pages * PAGE_SIZE);
4211 memcg_oom_recover(batch->memcg); 4206 memcg_oom_recover(batch->memcg);
4212 /* forget this pointer (for sanity check) */ 4207 /* forget this pointer (for sanity check) */
4213 batch->memcg = NULL; 4208 batch->memcg = NULL;
4214 } 4209 }
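
The uncharge batching above boils down to a nesting counter plus deferred res_counter updates: only the outermost mem_cgroup_uncharge_end() performs the flush. Below is a minimal userspace sketch of that pattern, assuming made-up helper names (batch_begin/batch_end/batched_uncharge); it is a model of the idea, not the kernel API.

/* Minimal model of nestable uncharge batching. */
#include <stdio.h>

static struct {
	int do_batch;            /* nesting depth, like memcg_batch.do_batch */
	unsigned long nr_pages;  /* pages accumulated while batching */
} batch;

static void batch_begin(void)
{
	if (++batch.do_batch == 1)       /* outermost begin resets the state */
		batch.nr_pages = 0;
}

static void batched_uncharge(unsigned long nr_pages)
{
	if (batch.do_batch)
		batch.nr_pages += nr_pages;              /* defer */
	else
		printf("uncharge %lu page(s) immediately\n", nr_pages);
}

static void batch_end(void)
{
	if (--batch.do_batch)            /* still nested: do nothing */
		return;
	if (batch.nr_pages)              /* outermost end flushes once */
		printf("uncharge %lu page(s) in one res_counter update\n",
		       batch.nr_pages);
	batch.nr_pages = 0;
}

int main(void)
{
	batch_begin();
	batch_begin();                   /* nesting is allowed */
	batched_uncharge(3);
	batched_uncharge(5);
	batch_end();                     /* inner end: nothing happens */
	batch_end();                     /* outer end: flushes 8 pages */
	return 0;
}

This is why callers such as truncation can uncharge many same-memcg pages with a single res_counter update instead of one per page.
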
4215 4210
4216 #ifdef CONFIG_SWAP 4211 #ifdef CONFIG_SWAP
4217 /* 4212 /*
4218 * Called after __delete_from_swap_cache(); drops the "page" account. 4213 * Called after __delete_from_swap_cache(); drops the "page" account.
4219 * The memcg information is recorded in the swap_cgroup of "ent". 4214 * The memcg information is recorded in the swap_cgroup of "ent".
4220 */ 4215 */
4221 void 4216 void
4222 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 4217 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4223 { 4218 {
4224 struct mem_cgroup *memcg; 4219 struct mem_cgroup *memcg;
4225 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 4220 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4226 4221
4227 if (!swapout) /* this was a swap cache but the swap is unused! */ 4222 if (!swapout) /* this was a swap cache but the swap is unused! */
4228 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 4223 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4229 4224
4230 memcg = __mem_cgroup_uncharge_common(page, ctype, false); 4225 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4231 4226
4232 /* 4227 /*
4233 * record memcg information, if swapout && memcg != NULL, 4228 * record memcg information, if swapout && memcg != NULL,
4234 * css_get() was called in uncharge(). 4229 * css_get() was called in uncharge().
4235 */ 4230 */
4236 if (do_swap_account && swapout && memcg) 4231 if (do_swap_account && swapout && memcg)
4237 swap_cgroup_record(ent, mem_cgroup_id(memcg)); 4232 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4238 } 4233 }
4239 #endif 4234 #endif
4240 4235
4241 #ifdef CONFIG_MEMCG_SWAP 4236 #ifdef CONFIG_MEMCG_SWAP
4242 /* 4237 /*
4243 * Called from swap_entry_free(). Removes the record in swap_cgroup and 4238 * Called from swap_entry_free(). Removes the record in swap_cgroup and
4244 * uncharges the "memsw" account. 4239 * uncharges the "memsw" account.
4245 */ 4240 */
4246 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4241 void mem_cgroup_uncharge_swap(swp_entry_t ent)
4247 { 4242 {
4248 struct mem_cgroup *memcg; 4243 struct mem_cgroup *memcg;
4249 unsigned short id; 4244 unsigned short id;
4250 4245
4251 if (!do_swap_account) 4246 if (!do_swap_account)
4252 return; 4247 return;
4253 4248
4254 id = swap_cgroup_record(ent, 0); 4249 id = swap_cgroup_record(ent, 0);
4255 rcu_read_lock(); 4250 rcu_read_lock();
4256 memcg = mem_cgroup_lookup(id); 4251 memcg = mem_cgroup_lookup(id);
4257 if (memcg) { 4252 if (memcg) {
4258 /* 4253 /*
4259 * We uncharge this because the swap is freed. 4254 * We uncharge this because the swap is freed.
4260 * This memcg can be an obsolete one. We avoid calling css_tryget(). 4255 * This memcg can be an obsolete one. We avoid calling css_tryget().
4261 */ 4256 */
4262 if (!mem_cgroup_is_root(memcg)) 4257 if (!mem_cgroup_is_root(memcg))
4263 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4258 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4264 mem_cgroup_swap_statistics(memcg, false); 4259 mem_cgroup_swap_statistics(memcg, false);
4265 css_put(&memcg->css); 4260 css_put(&memcg->css);
4266 } 4261 }
4267 rcu_read_unlock(); 4262 rcu_read_unlock();
4268 } 4263 }
4269 4264
4270 /** 4265 /**
4271 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4266 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
4272 * @entry: swap entry to be moved 4267 * @entry: swap entry to be moved
4273 * @from: mem_cgroup which the entry is moved from 4268 * @from: mem_cgroup which the entry is moved from
4274 * @to: mem_cgroup which the entry is moved to 4269 * @to: mem_cgroup which the entry is moved to
4275 * 4270 *
4276 * It succeeds only when the swap_cgroup's record for this entry is the same 4271 * It succeeds only when the swap_cgroup's record for this entry is the same
4277 * as the mem_cgroup's id of @from. 4272 * as the mem_cgroup's id of @from.
4278 * 4273 *
4279 * Returns 0 on success, -EINVAL on failure. 4274 * Returns 0 on success, -EINVAL on failure.
4280 * 4275 *
4281 * The caller must have charged to @to, IOW, called res_counter_charge() about 4276 * The caller must have charged to @to, IOW, called res_counter_charge() about
4282 * both res and memsw, and called css_get(). 4277 * both res and memsw, and called css_get().
4283 */ 4278 */
4284 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4279 static int mem_cgroup_move_swap_account(swp_entry_t entry,
4285 struct mem_cgroup *from, struct mem_cgroup *to) 4280 struct mem_cgroup *from, struct mem_cgroup *to)
4286 { 4281 {
4287 unsigned short old_id, new_id; 4282 unsigned short old_id, new_id;
4288 4283
4289 old_id = mem_cgroup_id(from); 4284 old_id = mem_cgroup_id(from);
4290 new_id = mem_cgroup_id(to); 4285 new_id = mem_cgroup_id(to);
4291 4286
4292 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4287 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4293 mem_cgroup_swap_statistics(from, false); 4288 mem_cgroup_swap_statistics(from, false);
4294 mem_cgroup_swap_statistics(to, true); 4289 mem_cgroup_swap_statistics(to, true);
4295 /* 4290 /*
4296 * This function is only called from task migration context now. 4291 * This function is only called from task migration context now.
4297 * It postpones res_counter and refcount handling till the end 4292 * It postpones res_counter and refcount handling till the end
4298 * of task migration(mem_cgroup_clear_mc()) for performance 4293 * of task migration(mem_cgroup_clear_mc()) for performance
4299 * improvement. But we cannot postpone css_get(to) because if 4294 * improvement. But we cannot postpone css_get(to) because if
4300 * the process that has been moved to @to does swap-in, the 4295 * the process that has been moved to @to does swap-in, the
4301 * refcount of @to might be decreased to 0. 4296 * refcount of @to might be decreased to 0.
4302 * 4297 *
4303 * We are in attach() phase, so the cgroup is guaranteed to be 4298 * We are in attach() phase, so the cgroup is guaranteed to be
4304 * alive, so we can just call css_get(). 4299 * alive, so we can just call css_get().
4305 */ 4300 */
4306 css_get(&to->css); 4301 css_get(&to->css);
4307 return 0; 4302 return 0;
4308 } 4303 }
4309 return -EINVAL; 4304 return -EINVAL;
4310 } 4305 }
4311 #else 4306 #else
4312 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4307 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4313 struct mem_cgroup *from, struct mem_cgroup *to) 4308 struct mem_cgroup *from, struct mem_cgroup *to)
4314 { 4309 {
4315 return -EINVAL; 4310 return -EINVAL;
4316 } 4311 }
4317 #endif 4312 #endif
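
The move above hinges on swap_cgroup_cmpxchg(): the record is handed from @from to @to only if it still names @from; otherwise -EINVAL is returned and nothing changes. A simplified, self-contained model of that compare-and-exchange follows, with a plain array standing in for the swap_cgroup map; the table and helper names are illustrative, not the kernel's layout.

#include <stdio.h>

#define NR_SWAP_ENTRIES 16
static unsigned short swap_owner[NR_SWAP_ENTRIES];   /* memcg id per swap entry */

/* Returns the old id; the record changes only if it matched old_id. */
static unsigned short record_cmpxchg(int entry, unsigned short old_id,
				     unsigned short new_id)
{
	unsigned short cur = swap_owner[entry];

	if (cur == old_id)
		swap_owner[entry] = new_id;
	return cur;
}

static int move_swap_record(int entry, unsigned short from, unsigned short to)
{
	if (record_cmpxchg(entry, from, to) == from)
		return 0;    /* moved; swap statistics would be adjusted here */
	return -1;           /* record belonged to someone else: -EINVAL above */
}

int main(void)
{
	swap_owner[3] = 7;
	printf("move owned entry:   %d\n", move_swap_record(3, 7, 9));  /* 0 */
	printf("move foreign entry: %d\n", move_swap_record(3, 7, 9));  /* -1 */
	return 0;
}
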
4318 4313
4319 /* 4314 /*
4320 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4315 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
4321 * page belongs to. 4316 * page belongs to.
4322 */ 4317 */
4323 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4318 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4324 struct mem_cgroup **memcgp) 4319 struct mem_cgroup **memcgp)
4325 { 4320 {
4326 struct mem_cgroup *memcg = NULL; 4321 struct mem_cgroup *memcg = NULL;
4327 unsigned int nr_pages = 1; 4322 unsigned int nr_pages = 1;
4328 struct page_cgroup *pc; 4323 struct page_cgroup *pc;
4329 enum charge_type ctype; 4324 enum charge_type ctype;
4330 4325
4331 *memcgp = NULL; 4326 *memcgp = NULL;
4332 4327
4333 if (mem_cgroup_disabled()) 4328 if (mem_cgroup_disabled())
4334 return; 4329 return;
4335 4330
4336 if (PageTransHuge(page)) 4331 if (PageTransHuge(page))
4337 nr_pages <<= compound_order(page); 4332 nr_pages <<= compound_order(page);
4338 4333
4339 pc = lookup_page_cgroup(page); 4334 pc = lookup_page_cgroup(page);
4340 lock_page_cgroup(pc); 4335 lock_page_cgroup(pc);
4341 if (PageCgroupUsed(pc)) { 4336 if (PageCgroupUsed(pc)) {
4342 memcg = pc->mem_cgroup; 4337 memcg = pc->mem_cgroup;
4343 css_get(&memcg->css); 4338 css_get(&memcg->css);
4344 /* 4339 /*
4345 * When migrating an anonymous page, its mapcount goes down 4340 * When migrating an anonymous page, its mapcount goes down
4346 * to 0 and uncharge() will be called. But, even if it's fully 4341 * to 0 and uncharge() will be called. But, even if it's fully
4347 * unmapped, migration may fail and this page has to be 4342 * unmapped, migration may fail and this page has to be
4348 * charged again. We set MIGRATION flag here and delay uncharge 4343 * charged again. We set MIGRATION flag here and delay uncharge
4349 * until end_migration() is called 4344 * until end_migration() is called
4350 * 4345 *
4351 * Corner Case Thinking 4346 * Corner Case Thinking
4352 * A) 4347 * A)
4353 * When the old page was mapped as Anon and was unmapped and freed 4348 * When the old page was mapped as Anon and was unmapped and freed
4354 * while migration was ongoing. 4349 * while migration was ongoing.
4355 * If unmap finds the old page, uncharge() of it will be delayed 4350 * If unmap finds the old page, uncharge() of it will be delayed
4356 * until end_migration(). If unmap finds a new page, it's 4351 * until end_migration(). If unmap finds a new page, it's
4357 * uncharged when its mapcount goes from 1 to 0. If the unmap code 4352 * uncharged when its mapcount goes from 1 to 0. If the unmap code
4358 * finds swap_migration_entry, the new page will not be mapped 4353 * finds swap_migration_entry, the new page will not be mapped
4359 * and end_migration() will find it (mapcount == 0). 4354 * and end_migration() will find it (mapcount == 0).
4360 * 4355 *
4361 * B) 4356 * B)
4362 * When the old page was mapped but migration fails, the kernel 4357 * When the old page was mapped but migration fails, the kernel
4363 * remaps it. A charge for it is kept by MIGRATION flag even 4358 * remaps it. A charge for it is kept by MIGRATION flag even
4364 * if mapcount goes down to 0. We can do remap successfully 4359 * if mapcount goes down to 0. We can do remap successfully
4365 * without charging it again. 4360 * without charging it again.
4366 * 4361 *
4367 * C) 4362 * C)
4368 * The "old" page is under lock_page() until the end of 4363 * The "old" page is under lock_page() until the end of
4369 * migration, so, the old page itself will not be swapped-out. 4364 * migration, so, the old page itself will not be swapped-out.
4370 * If the new page is swapped out before end_migration, our 4365 * If the new page is swapped out before end_migration, our
4371 * hook to usual swap-out path will catch the event. 4366 * hook to usual swap-out path will catch the event.
4372 */ 4367 */
4373 if (PageAnon(page)) 4368 if (PageAnon(page))
4374 SetPageCgroupMigration(pc); 4369 SetPageCgroupMigration(pc);
4375 } 4370 }
4376 unlock_page_cgroup(pc); 4371 unlock_page_cgroup(pc);
4377 /* 4372 /*
4378 * If the page is not charged at this point, 4373 * If the page is not charged at this point,
4379 * we return here. 4374 * we return here.
4380 */ 4375 */
4381 if (!memcg) 4376 if (!memcg)
4382 return; 4377 return;
4383 4378
4384 *memcgp = memcg; 4379 *memcgp = memcg;
4385 /* 4380 /*
4386 * We charge new page before it's used/mapped. So, even if unlock_page() 4381 * We charge new page before it's used/mapped. So, even if unlock_page()
4387 * is called before end_migration, we can catch all events on this new 4382 * is called before end_migration, we can catch all events on this new
4388 * page. In case the new page is migrated but not remapped, the new page's 4383 * page. In case the new page is migrated but not remapped, the new page's
4389 * mapcount will finally be 0 and we call uncharge in end_migration(). 4384 * mapcount will finally be 0 and we call uncharge in end_migration().
4390 */ 4385 */
4391 if (PageAnon(page)) 4386 if (PageAnon(page))
4392 ctype = MEM_CGROUP_CHARGE_TYPE_ANON; 4387 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4393 else 4388 else
4394 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 4389 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4395 /* 4390 /*
4396 * The page is committed to the memcg, but it's not actually 4391 * The page is committed to the memcg, but it's not actually
4397 * charged to the res_counter since we plan on replacing the 4392 * charged to the res_counter since we plan on replacing the
4398 * old one and only one page is going to be left afterwards. 4393 * old one and only one page is going to be left afterwards.
4399 */ 4394 */
4400 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); 4395 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4401 } 4396 }
4402 4397
4403 /* remove redundant charge if migration failed*/ 4398 /* remove redundant charge if migration failed*/
4404 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 4399 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4405 struct page *oldpage, struct page *newpage, bool migration_ok) 4400 struct page *oldpage, struct page *newpage, bool migration_ok)
4406 { 4401 {
4407 struct page *used, *unused; 4402 struct page *used, *unused;
4408 struct page_cgroup *pc; 4403 struct page_cgroup *pc;
4409 bool anon; 4404 bool anon;
4410 4405
4411 if (!memcg) 4406 if (!memcg)
4412 return; 4407 return;
4413 4408
4414 if (!migration_ok) { 4409 if (!migration_ok) {
4415 used = oldpage; 4410 used = oldpage;
4416 unused = newpage; 4411 unused = newpage;
4417 } else { 4412 } else {
4418 used = newpage; 4413 used = newpage;
4419 unused = oldpage; 4414 unused = oldpage;
4420 } 4415 }
4421 anon = PageAnon(used); 4416 anon = PageAnon(used);
4422 __mem_cgroup_uncharge_common(unused, 4417 __mem_cgroup_uncharge_common(unused,
4423 anon ? MEM_CGROUP_CHARGE_TYPE_ANON 4418 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4424 : MEM_CGROUP_CHARGE_TYPE_CACHE, 4419 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4425 true); 4420 true);
4426 css_put(&memcg->css); 4421 css_put(&memcg->css);
4427 /* 4422 /*
4428 * We disallowed uncharge of pages under migration because mapcount 4423 * We disallowed uncharge of pages under migration because mapcount
4429 * of the page goes down to zero, temporarily. 4424 * of the page goes down to zero, temporarily.
4430 * Clear the flag and check whether the page should be charged. 4425 * Clear the flag and check whether the page should be charged.
4431 */ 4426 */
4432 pc = lookup_page_cgroup(oldpage); 4427 pc = lookup_page_cgroup(oldpage);
4433 lock_page_cgroup(pc); 4428 lock_page_cgroup(pc);
4434 ClearPageCgroupMigration(pc); 4429 ClearPageCgroupMigration(pc);
4435 unlock_page_cgroup(pc); 4430 unlock_page_cgroup(pc);
4436 4431
4437 /* 4432 /*
4438 * If a page is a file cache, radix-tree replacement is very atomic 4433 * If a page is a file cache, radix-tree replacement is very atomic
4439 * and we can skip this check. When it was an Anon page, its mapcount 4434 * and we can skip this check. When it was an Anon page, its mapcount
4440 * goes down to 0. But because we added the MIGRATION flag, it's not 4435 * goes down to 0. But because we added the MIGRATION flag, it's not
4441 * uncharged yet. There are several cases but the page->mapcount check 4436 * uncharged yet. There are several cases but the page->mapcount check
4442 * and USED bit check in mem_cgroup_uncharge_page() will do enough 4437 * and USED bit check in mem_cgroup_uncharge_page() will do enough
4443 * check. (see prepare_charge() also) 4438 * check. (see prepare_charge() also)
4444 */ 4439 */
4445 if (anon) 4440 if (anon)
4446 mem_cgroup_uncharge_page(used); 4441 mem_cgroup_uncharge_page(used);
4447 } 4442 }
4448 4443
4449 /* 4444 /*
4450 * At replace page cache, newpage is not under any memcg but it's on 4445 * At replace page cache, newpage is not under any memcg but it's on
4451 * LRU. So, this function doesn't touch res_counter but handles LRU 4446 * LRU. So, this function doesn't touch res_counter but handles LRU
4452 * in correct way. Both pages are locked so we cannot race with uncharge. 4447 * in correct way. Both pages are locked so we cannot race with uncharge.
4453 */ 4448 */
4454 void mem_cgroup_replace_page_cache(struct page *oldpage, 4449 void mem_cgroup_replace_page_cache(struct page *oldpage,
4455 struct page *newpage) 4450 struct page *newpage)
4456 { 4451 {
4457 struct mem_cgroup *memcg = NULL; 4452 struct mem_cgroup *memcg = NULL;
4458 struct page_cgroup *pc; 4453 struct page_cgroup *pc;
4459 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4454 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4460 4455
4461 if (mem_cgroup_disabled()) 4456 if (mem_cgroup_disabled())
4462 return; 4457 return;
4463 4458
4464 pc = lookup_page_cgroup(oldpage); 4459 pc = lookup_page_cgroup(oldpage);
4465 /* fix accounting on old pages */ 4460 /* fix accounting on old pages */
4466 lock_page_cgroup(pc); 4461 lock_page_cgroup(pc);
4467 if (PageCgroupUsed(pc)) { 4462 if (PageCgroupUsed(pc)) {
4468 memcg = pc->mem_cgroup; 4463 memcg = pc->mem_cgroup;
4469 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4464 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4470 ClearPageCgroupUsed(pc); 4465 ClearPageCgroupUsed(pc);
4471 } 4466 }
4472 unlock_page_cgroup(pc); 4467 unlock_page_cgroup(pc);
4473 4468
4474 /* 4469 /*
4475 * When called from shmem_replace_page(), in some cases the 4470 * When called from shmem_replace_page(), in some cases the
4476 * oldpage has already been charged, and in some cases not. 4471 * oldpage has already been charged, and in some cases not.
4477 */ 4472 */
4478 if (!memcg) 4473 if (!memcg)
4479 return; 4474 return;
4480 /* 4475 /*
4481 * Even if newpage->mapping was NULL before starting replacement, 4476 * Even if newpage->mapping was NULL before starting replacement,
4482 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4477 * the newpage may be on LRU(or pagevec for LRU) already. We lock
4483 * LRU while we overwrite pc->mem_cgroup. 4478 * LRU while we overwrite pc->mem_cgroup.
4484 */ 4479 */
4485 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4480 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4486 } 4481 }
4487 4482
4488 #ifdef CONFIG_DEBUG_VM 4483 #ifdef CONFIG_DEBUG_VM
4489 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4484 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4490 { 4485 {
4491 struct page_cgroup *pc; 4486 struct page_cgroup *pc;
4492 4487
4493 pc = lookup_page_cgroup(page); 4488 pc = lookup_page_cgroup(page);
4494 /* 4489 /*
4495 * Can be NULL while feeding pages into the page allocator for 4490 * Can be NULL while feeding pages into the page allocator for
4496 * the first time, i.e. during boot or memory hotplug; 4491 * the first time, i.e. during boot or memory hotplug;
4497 * or when mem_cgroup_disabled(). 4492 * or when mem_cgroup_disabled().
4498 */ 4493 */
4499 if (likely(pc) && PageCgroupUsed(pc)) 4494 if (likely(pc) && PageCgroupUsed(pc))
4500 return pc; 4495 return pc;
4501 return NULL; 4496 return NULL;
4502 } 4497 }
4503 4498
4504 bool mem_cgroup_bad_page_check(struct page *page) 4499 bool mem_cgroup_bad_page_check(struct page *page)
4505 { 4500 {
4506 if (mem_cgroup_disabled()) 4501 if (mem_cgroup_disabled())
4507 return false; 4502 return false;
4508 4503
4509 return lookup_page_cgroup_used(page) != NULL; 4504 return lookup_page_cgroup_used(page) != NULL;
4510 } 4505 }
4511 4506
4512 void mem_cgroup_print_bad_page(struct page *page) 4507 void mem_cgroup_print_bad_page(struct page *page)
4513 { 4508 {
4514 struct page_cgroup *pc; 4509 struct page_cgroup *pc;
4515 4510
4516 pc = lookup_page_cgroup_used(page); 4511 pc = lookup_page_cgroup_used(page);
4517 if (pc) { 4512 if (pc) {
4518 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4513 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4519 pc, pc->flags, pc->mem_cgroup); 4514 pc, pc->flags, pc->mem_cgroup);
4520 } 4515 }
4521 } 4516 }
4522 #endif 4517 #endif
4523 4518
4524 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4519 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4525 unsigned long long val) 4520 unsigned long long val)
4526 { 4521 {
4527 int retry_count; 4522 int retry_count;
4528 u64 memswlimit, memlimit; 4523 u64 memswlimit, memlimit;
4529 int ret = 0; 4524 int ret = 0;
4530 int children = mem_cgroup_count_children(memcg); 4525 int children = mem_cgroup_count_children(memcg);
4531 u64 curusage, oldusage; 4526 u64 curusage, oldusage;
4532 int enlarge; 4527 int enlarge;
4533 4528
4534 /* 4529 /*
4535 * For keeping hierarchical_reclaim simple, how long we should retry 4530 * For keeping hierarchical_reclaim simple, how long we should retry
4536 * depends on the callers. We set our retry-count to be a function 4531 * depends on the callers. We set our retry-count to be a function
4537 * of the # of children which we should visit in this loop. 4532 * of the # of children which we should visit in this loop.
4538 */ 4533 */
4539 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4534 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4540 4535
4541 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4536 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4542 4537
4543 enlarge = 0; 4538 enlarge = 0;
4544 while (retry_count) { 4539 while (retry_count) {
4545 if (signal_pending(current)) { 4540 if (signal_pending(current)) {
4546 ret = -EINTR; 4541 ret = -EINTR;
4547 break; 4542 break;
4548 } 4543 }
4549 /* 4544 /*
4550 * Rather than hiding all of this in some function, I do it in an 4545 * Rather than hiding all of this in some function, I do it in an
4551 * open-coded manner so you can see what it really does. 4546 * open-coded manner so you can see what it really does.
4552 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4547 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4553 */ 4548 */
4554 mutex_lock(&set_limit_mutex); 4549 mutex_lock(&set_limit_mutex);
4555 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4550 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4556 if (memswlimit < val) { 4551 if (memswlimit < val) {
4557 ret = -EINVAL; 4552 ret = -EINVAL;
4558 mutex_unlock(&set_limit_mutex); 4553 mutex_unlock(&set_limit_mutex);
4559 break; 4554 break;
4560 } 4555 }
4561 4556
4562 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4557 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4563 if (memlimit < val) 4558 if (memlimit < val)
4564 enlarge = 1; 4559 enlarge = 1;
4565 4560
4566 ret = res_counter_set_limit(&memcg->res, val); 4561 ret = res_counter_set_limit(&memcg->res, val);
4567 if (!ret) { 4562 if (!ret) {
4568 if (memswlimit == val) 4563 if (memswlimit == val)
4569 memcg->memsw_is_minimum = true; 4564 memcg->memsw_is_minimum = true;
4570 else 4565 else
4571 memcg->memsw_is_minimum = false; 4566 memcg->memsw_is_minimum = false;
4572 } 4567 }
4573 mutex_unlock(&set_limit_mutex); 4568 mutex_unlock(&set_limit_mutex);
4574 4569
4575 if (!ret) 4570 if (!ret)
4576 break; 4571 break;
4577 4572
4578 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4573 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4579 MEM_CGROUP_RECLAIM_SHRINK); 4574 MEM_CGROUP_RECLAIM_SHRINK);
4580 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4575 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4581 /* Usage is reduced? */ 4576 /* Usage is reduced? */
4582 if (curusage >= oldusage) 4577 if (curusage >= oldusage)
4583 retry_count--; 4578 retry_count--;
4584 else 4579 else
4585 oldusage = curusage; 4580 oldusage = curusage;
4586 } 4581 }
4587 if (!ret && enlarge) 4582 if (!ret && enlarge)
4588 memcg_oom_recover(memcg); 4583 memcg_oom_recover(memcg);
4589 4584
4590 return ret; 4585 return ret;
4591 } 4586 }
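
mem_cgroup_resize_limit() follows a common retry pattern: try to set the limit, reclaim if that fails, and consume one of the retries only when usage did not drop, i.e. when reclaim made no progress. A self-contained sketch of that pattern, with simulated stand-ins for the usage counter, res_counter_set_limit() and reclaim:

#include <stdbool.h>
#include <stdio.h>

static unsigned long long usage = 100;          /* simulated RES_USAGE, in pages */

static bool try_set_limit(unsigned long long val)
{
	return usage <= val;                    /* like res_counter_set_limit() */
}

static void reclaim_some(void)
{
	if (usage >= 10)
		usage -= 10;                    /* pretend reclaim freed pages */
}

static int resize_limit(unsigned long long val, int retries)
{
	unsigned long long old = usage, cur;

	while (retries) {
		if (try_set_limit(val))
			return 0;               /* the limit now fits usage */
		reclaim_some();
		cur = usage;
		if (cur >= old)
			retries--;              /* no progress: consume a retry */
		else
			old = cur;              /* progress: keep going */
	}
	return -1;                              /* give up, akin to -EBUSY */
}

int main(void)
{
	printf("resize to 50: %d (usage now %llu)\n", resize_limit(50, 5), usage);
	return 0;
}
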
4592 4587
4593 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4588 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4594 unsigned long long val) 4589 unsigned long long val)
4595 { 4590 {
4596 int retry_count; 4591 int retry_count;
4597 u64 memlimit, memswlimit, oldusage, curusage; 4592 u64 memlimit, memswlimit, oldusage, curusage;
4598 int children = mem_cgroup_count_children(memcg); 4593 int children = mem_cgroup_count_children(memcg);
4599 int ret = -EBUSY; 4594 int ret = -EBUSY;
4600 int enlarge = 0; 4595 int enlarge = 0;
4601 4596
4602 /* see mem_cgroup_resize_res_limit */ 4597 /* see mem_cgroup_resize_res_limit */
4603 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4598 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4604 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4599 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4605 while (retry_count) { 4600 while (retry_count) {
4606 if (signal_pending(current)) { 4601 if (signal_pending(current)) {
4607 ret = -EINTR; 4602 ret = -EINTR;
4608 break; 4603 break;
4609 } 4604 }
4610 /* 4605 /*
4611 * Rather than hiding all of this in some function, I do it in an 4606 * Rather than hiding all of this in some function, I do it in an
4612 * open-coded manner so you can see what it really does. 4607 * open-coded manner so you can see what it really does.
4613 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4608 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4614 */ 4609 */
4615 mutex_lock(&set_limit_mutex); 4610 mutex_lock(&set_limit_mutex);
4616 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4611 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4617 if (memlimit > val) { 4612 if (memlimit > val) {
4618 ret = -EINVAL; 4613 ret = -EINVAL;
4619 mutex_unlock(&set_limit_mutex); 4614 mutex_unlock(&set_limit_mutex);
4620 break; 4615 break;
4621 } 4616 }
4622 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4617 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4623 if (memswlimit < val) 4618 if (memswlimit < val)
4624 enlarge = 1; 4619 enlarge = 1;
4625 ret = res_counter_set_limit(&memcg->memsw, val); 4620 ret = res_counter_set_limit(&memcg->memsw, val);
4626 if (!ret) { 4621 if (!ret) {
4627 if (memlimit == val) 4622 if (memlimit == val)
4628 memcg->memsw_is_minimum = true; 4623 memcg->memsw_is_minimum = true;
4629 else 4624 else
4630 memcg->memsw_is_minimum = false; 4625 memcg->memsw_is_minimum = false;
4631 } 4626 }
4632 mutex_unlock(&set_limit_mutex); 4627 mutex_unlock(&set_limit_mutex);
4633 4628
4634 if (!ret) 4629 if (!ret)
4635 break; 4630 break;
4636 4631
4637 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4632 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4638 MEM_CGROUP_RECLAIM_NOSWAP | 4633 MEM_CGROUP_RECLAIM_NOSWAP |
4639 MEM_CGROUP_RECLAIM_SHRINK); 4634 MEM_CGROUP_RECLAIM_SHRINK);
4640 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4635 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4641 /* Usage is reduced? */ 4636 /* Usage is reduced? */
4642 if (curusage >= oldusage) 4637 if (curusage >= oldusage)
4643 retry_count--; 4638 retry_count--;
4644 else 4639 else
4645 oldusage = curusage; 4640 oldusage = curusage;
4646 } 4641 }
4647 if (!ret && enlarge) 4642 if (!ret && enlarge)
4648 memcg_oom_recover(memcg); 4643 memcg_oom_recover(memcg);
4649 return ret; 4644 return ret;
4650 } 4645 }
4651 4646
4652 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4647 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4653 gfp_t gfp_mask, 4648 gfp_t gfp_mask,
4654 unsigned long *total_scanned) 4649 unsigned long *total_scanned)
4655 { 4650 {
4656 unsigned long nr_reclaimed = 0; 4651 unsigned long nr_reclaimed = 0;
4657 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4652 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4658 unsigned long reclaimed; 4653 unsigned long reclaimed;
4659 int loop = 0; 4654 int loop = 0;
4660 struct mem_cgroup_tree_per_zone *mctz; 4655 struct mem_cgroup_tree_per_zone *mctz;
4661 unsigned long long excess; 4656 unsigned long long excess;
4662 unsigned long nr_scanned; 4657 unsigned long nr_scanned;
4663 4658
4664 if (order > 0) 4659 if (order > 0)
4665 return 0; 4660 return 0;
4666 4661
4667 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4662 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4668 /* 4663 /*
4669 * This loop can run for a while, especially if mem_cgroups continuously 4664 * This loop can run for a while, especially if mem_cgroups continuously
4670 * keep exceeding their soft limit and putting the system under 4665 * keep exceeding their soft limit and putting the system under
4671 * pressure 4666 * pressure
4672 */ 4667 */
4673 do { 4668 do {
4674 if (next_mz) 4669 if (next_mz)
4675 mz = next_mz; 4670 mz = next_mz;
4676 else 4671 else
4677 mz = mem_cgroup_largest_soft_limit_node(mctz); 4672 mz = mem_cgroup_largest_soft_limit_node(mctz);
4678 if (!mz) 4673 if (!mz)
4679 break; 4674 break;
4680 4675
4681 nr_scanned = 0; 4676 nr_scanned = 0;
4682 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4677 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4683 gfp_mask, &nr_scanned); 4678 gfp_mask, &nr_scanned);
4684 nr_reclaimed += reclaimed; 4679 nr_reclaimed += reclaimed;
4685 *total_scanned += nr_scanned; 4680 *total_scanned += nr_scanned;
4686 spin_lock(&mctz->lock); 4681 spin_lock(&mctz->lock);
4687 4682
4688 /* 4683 /*
4689 * If we failed to reclaim anything from this memory cgroup 4684 * If we failed to reclaim anything from this memory cgroup
4690 * it is time to move on to the next cgroup 4685 * it is time to move on to the next cgroup
4691 */ 4686 */
4692 next_mz = NULL; 4687 next_mz = NULL;
4693 if (!reclaimed) { 4688 if (!reclaimed) {
4694 do { 4689 do {
4695 /* 4690 /*
4696 * Loop until we find yet another one. 4691 * Loop until we find yet another one.
4697 * 4692 *
4698 * By the time we get the soft_limit lock 4693 * By the time we get the soft_limit lock
4699 * again, someone might have added the 4694 * again, someone might have added the
4700 * group back on the RB tree. Iterate to 4695 * group back on the RB tree. Iterate to
4701 * make sure we get a different mem. 4696 * make sure we get a different mem.
4702 * mem_cgroup_largest_soft_limit_node returns 4697 * mem_cgroup_largest_soft_limit_node returns
4703 * NULL if no other cgroup is present on 4698 * NULL if no other cgroup is present on
4704 * the tree 4699 * the tree
4705 */ 4700 */
4706 next_mz = 4701 next_mz =
4707 __mem_cgroup_largest_soft_limit_node(mctz); 4702 __mem_cgroup_largest_soft_limit_node(mctz);
4708 if (next_mz == mz) 4703 if (next_mz == mz)
4709 css_put(&next_mz->memcg->css); 4704 css_put(&next_mz->memcg->css);
4710 else /* next_mz == NULL or other memcg */ 4705 else /* next_mz == NULL or other memcg */
4711 break; 4706 break;
4712 } while (1); 4707 } while (1);
4713 } 4708 }
4714 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4709 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4715 excess = res_counter_soft_limit_excess(&mz->memcg->res); 4710 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4716 /* 4711 /*
4717 * One school of thought says that we should not add 4712 * One school of thought says that we should not add
4718 * back the node to the tree if reclaim returns 0. 4713 * back the node to the tree if reclaim returns 0.
4719 * But our reclaim could return 0 simply because, due 4714 * But our reclaim could return 0 simply because, due
4720 * to priority, we are exposing a smaller subset of 4715 * to priority, we are exposing a smaller subset of
4721 * memory to reclaim from. Consider this as a longer 4716 * memory to reclaim from. Consider this as a longer
4722 * term TODO. 4717 * term TODO.
4723 */ 4718 */
4724 /* If excess == 0, no tree ops */ 4719 /* If excess == 0, no tree ops */
4725 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4720 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4726 spin_unlock(&mctz->lock); 4721 spin_unlock(&mctz->lock);
4727 css_put(&mz->memcg->css); 4722 css_put(&mz->memcg->css);
4728 loop++; 4723 loop++;
4729 /* 4724 /*
4730 * Could not reclaim anything and there are no more 4725 * Could not reclaim anything and there are no more
4731 * mem cgroups to try or we seem to be looping without 4726 * mem cgroups to try or we seem to be looping without
4732 * reclaiming anything. 4727 * reclaiming anything.
4733 */ 4728 */
4734 if (!nr_reclaimed && 4729 if (!nr_reclaimed &&
4735 (next_mz == NULL || 4730 (next_mz == NULL ||
4736 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 4731 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4737 break; 4732 break;
4738 } while (!nr_reclaimed); 4733 } while (!nr_reclaimed);
4739 if (next_mz) 4734 if (next_mz)
4740 css_put(&next_mz->memcg->css); 4735 css_put(&next_mz->memcg->css);
4741 return nr_reclaimed; 4736 return nr_reclaimed;
4742 } 4737 }
4743 4738
4744 /** 4739 /**
4745 * mem_cgroup_force_empty_list - clears LRU of a group 4740 * mem_cgroup_force_empty_list - clears LRU of a group
4746 * @memcg: group to clear 4741 * @memcg: group to clear
4747 * @node: NUMA node 4742 * @node: NUMA node
4748 * @zid: zone id 4743 * @zid: zone id
4749 * @lru: lru to clear 4744 * @lru: lru to clear
4750 * 4745 *
4751 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4746 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
4752 * reclaim the pages themselves - pages are moved to the parent (or root) 4747 * reclaim the pages themselves - pages are moved to the parent (or root)
4753 * group. 4748 * group.
4754 */ 4749 */
4755 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4750 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4756 int node, int zid, enum lru_list lru) 4751 int node, int zid, enum lru_list lru)
4757 { 4752 {
4758 struct lruvec *lruvec; 4753 struct lruvec *lruvec;
4759 unsigned long flags; 4754 unsigned long flags;
4760 struct list_head *list; 4755 struct list_head *list;
4761 struct page *busy; 4756 struct page *busy;
4762 struct zone *zone; 4757 struct zone *zone;
4763 4758
4764 zone = &NODE_DATA(node)->node_zones[zid]; 4759 zone = &NODE_DATA(node)->node_zones[zid];
4765 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4760 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4766 list = &lruvec->lists[lru]; 4761 list = &lruvec->lists[lru];
4767 4762
4768 busy = NULL; 4763 busy = NULL;
4769 do { 4764 do {
4770 struct page_cgroup *pc; 4765 struct page_cgroup *pc;
4771 struct page *page; 4766 struct page *page;
4772 4767
4773 spin_lock_irqsave(&zone->lru_lock, flags); 4768 spin_lock_irqsave(&zone->lru_lock, flags);
4774 if (list_empty(list)) { 4769 if (list_empty(list)) {
4775 spin_unlock_irqrestore(&zone->lru_lock, flags); 4770 spin_unlock_irqrestore(&zone->lru_lock, flags);
4776 break; 4771 break;
4777 } 4772 }
4778 page = list_entry(list->prev, struct page, lru); 4773 page = list_entry(list->prev, struct page, lru);
4779 if (busy == page) { 4774 if (busy == page) {
4780 list_move(&page->lru, list); 4775 list_move(&page->lru, list);
4781 busy = NULL; 4776 busy = NULL;
4782 spin_unlock_irqrestore(&zone->lru_lock, flags); 4777 spin_unlock_irqrestore(&zone->lru_lock, flags);
4783 continue; 4778 continue;
4784 } 4779 }
4785 spin_unlock_irqrestore(&zone->lru_lock, flags); 4780 spin_unlock_irqrestore(&zone->lru_lock, flags);
4786 4781
4787 pc = lookup_page_cgroup(page); 4782 pc = lookup_page_cgroup(page);
4788 4783
4789 if (mem_cgroup_move_parent(page, pc, memcg)) { 4784 if (mem_cgroup_move_parent(page, pc, memcg)) {
4790 /* found lock contention or "pc" is obsolete. */ 4785 /* found lock contention or "pc" is obsolete. */
4791 busy = page; 4786 busy = page;
4792 cond_resched(); 4787 cond_resched();
4793 } else 4788 } else
4794 busy = NULL; 4789 busy = NULL;
4795 } while (!list_empty(list)); 4790 } while (!list_empty(list));
4796 } 4791 }
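
The "busy" variable above is a small anti-livelock trick: if the same page shows up at the tail twice in a row because it could not be moved to the parent (lock contention, obsolete pc), it is rotated to the head so the loop keeps making progress on other pages. A compact model of that behaviour, with an integer array standing in for the LRU and a stub move_to_parent() that fails once; all names here are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define N 4
static int list[N] = {1, 2, 3, 4};      /* index N-1 plays the role of list->prev */
static int len = N;

static bool move_to_parent(int page)
{
	static bool contended = true;

	if (page == 3 && contended) {   /* fail once to exercise the busy path */
		contended = false;
		return false;
	}
	return true;
}

int main(void)
{
	int busy = -1;

	while (len) {
		int page = list[len - 1];       /* take the tail */

		if (page == busy) {
			/* seen it fail already: rotate to the head and move on */
			for (int i = len - 1; i > 0; i--)
				list[i] = list[i - 1];
			list[0] = page;
			busy = -1;
			continue;
		}
		if (move_to_parent(page)) {
			len--;                  /* page left this LRU */
			busy = -1;
		} else {
			busy = page;            /* remember it and retry later */
		}
	}
	printf("list drained\n");
	return 0;
}
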
4797 4792
4798 /* 4793 /*
4799 * Make the mem_cgroup's charge 0 if there is no task, by moving 4794 * Make the mem_cgroup's charge 0 if there is no task, by moving
4800 * all the charges and pages to the parent. 4795 * all the charges and pages to the parent.
4801 * This enables deleting this mem_cgroup. 4796 * This enables deleting this mem_cgroup.
4802 * 4797 *
4803 * Caller is responsible for holding css reference on the memcg. 4798 * Caller is responsible for holding css reference on the memcg.
4804 */ 4799 */
4805 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4800 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4806 { 4801 {
4807 int node, zid; 4802 int node, zid;
4808 u64 usage; 4803 u64 usage;
4809 4804
4810 do { 4805 do {
4811 /* This is for making all *used* pages be on the LRU. */ 4806 /* This is for making all *used* pages be on the LRU. */
4812 lru_add_drain_all(); 4807 lru_add_drain_all();
4813 drain_all_stock_sync(memcg); 4808 drain_all_stock_sync(memcg);
4814 mem_cgroup_start_move(memcg); 4809 mem_cgroup_start_move(memcg);
4815 for_each_node_state(node, N_MEMORY) { 4810 for_each_node_state(node, N_MEMORY) {
4816 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4811 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4817 enum lru_list lru; 4812 enum lru_list lru;
4818 for_each_lru(lru) { 4813 for_each_lru(lru) {
4819 mem_cgroup_force_empty_list(memcg, 4814 mem_cgroup_force_empty_list(memcg,
4820 node, zid, lru); 4815 node, zid, lru);
4821 } 4816 }
4822 } 4817 }
4823 } 4818 }
4824 mem_cgroup_end_move(memcg); 4819 mem_cgroup_end_move(memcg);
4825 memcg_oom_recover(memcg); 4820 memcg_oom_recover(memcg);
4826 cond_resched(); 4821 cond_resched();
4827 4822
4828 /* 4823 /*
4829 * Kernel memory may not necessarily be attributable to a specific 4824 * Kernel memory may not necessarily be attributable to a specific
4830 * process, so such charges are not migrated, and therefore we can't 4825 * process, so such charges are not migrated, and therefore we can't
4831 * expect their value to drop to 0 here. 4826 * expect their value to drop to 0 here.
4832 * Having res filled up with kmem only is enough. 4827 * Having res filled up with kmem only is enough.
4833 * 4828 *
4834 * This is a safety check because mem_cgroup_force_empty_list 4829 * This is a safety check because mem_cgroup_force_empty_list
4835 * could have raced with mem_cgroup_replace_page_cache callers 4830 * could have raced with mem_cgroup_replace_page_cache callers
4836 * so the lru seemed empty but the page could have been added 4831 * so the lru seemed empty but the page could have been added
4837 * right after the check. RES_USAGE should be safe as we always 4832 * right after the check. RES_USAGE should be safe as we always
4838 * charge before adding to the LRU. 4833 * charge before adding to the LRU.
4839 */ 4834 */
4840 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4835 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4841 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4836 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4842 } while (usage > 0); 4837 } while (usage > 0);
4843 } 4838 }
4844 4839
4845 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4840 static inline bool memcg_has_children(struct mem_cgroup *memcg)
4846 { 4841 {
4847 lockdep_assert_held(&memcg_create_mutex); 4842 lockdep_assert_held(&memcg_create_mutex);
4848 /* 4843 /*
4849 * The lock does not prevent addition or deletion to the list 4844 * The lock does not prevent addition or deletion to the list
4850 * of children, but it prevents a new child from being 4845 * of children, but it prevents a new child from being
4851 * initialized based on this parent in css_online(), so it's 4846 * initialized based on this parent in css_online(), so it's
4852 * enough to decide whether hierarchically inherited 4847 * enough to decide whether hierarchically inherited
4853 * attributes can still be changed or not. 4848 * attributes can still be changed or not.
4854 */ 4849 */
4855 return memcg->use_hierarchy && 4850 return memcg->use_hierarchy &&
4856 !list_empty(&memcg->css.cgroup->children); 4851 !list_empty(&memcg->css.cgroup->children);
4857 } 4852 }
4858 4853
4859 /* 4854 /*
4860 * Reclaims as many pages from the given memcg as possible and moves 4855 * Reclaims as many pages from the given memcg as possible and moves
4861 * the rest to the parent. 4856 * the rest to the parent.
4862 * 4857 *
4863 * Caller is responsible for holding css reference for memcg. 4858 * Caller is responsible for holding css reference for memcg.
4864 */ 4859 */
4865 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4860 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4866 { 4861 {
4867 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4862 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4868 struct cgroup *cgrp = memcg->css.cgroup; 4863 struct cgroup *cgrp = memcg->css.cgroup;
4869 4864
4870 /* returns EBUSY if there is a task or if we come here twice. */ 4865 /* returns EBUSY if there is a task or if we come here twice. */
4871 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) 4866 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4872 return -EBUSY; 4867 return -EBUSY;
4873 4868
4874 /* we call try-to-free pages to make this cgroup empty */ 4869 /* we call try-to-free pages to make this cgroup empty */
4875 lru_add_drain_all(); 4870 lru_add_drain_all();
4876 /* try to free all pages in this cgroup */ 4871 /* try to free all pages in this cgroup */
4877 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4872 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4878 int progress; 4873 int progress;
4879 4874
4880 if (signal_pending(current)) 4875 if (signal_pending(current))
4881 return -EINTR; 4876 return -EINTR;
4882 4877
4883 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4878 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4884 false); 4879 false);
4885 if (!progress) { 4880 if (!progress) {
4886 nr_retries--; 4881 nr_retries--;
4887 /* maybe some writeback is necessary */ 4882 /* maybe some writeback is necessary */
4888 congestion_wait(BLK_RW_ASYNC, HZ/10); 4883 congestion_wait(BLK_RW_ASYNC, HZ/10);
4889 } 4884 }
4890 4885
4891 } 4886 }
4892 lru_add_drain(); 4887 lru_add_drain();
4893 mem_cgroup_reparent_charges(memcg); 4888 mem_cgroup_reparent_charges(memcg);
4894 4889
4895 return 0; 4890 return 0;
4896 } 4891 }
4897 4892
4898 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, 4893 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4899 unsigned int event) 4894 unsigned int event)
4900 { 4895 {
4901 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4896 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4902 4897
4903 if (mem_cgroup_is_root(memcg)) 4898 if (mem_cgroup_is_root(memcg))
4904 return -EINVAL; 4899 return -EINVAL;
4905 return mem_cgroup_force_empty(memcg); 4900 return mem_cgroup_force_empty(memcg);
4906 } 4901 }
4907 4902
4908 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4903 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
4909 struct cftype *cft) 4904 struct cftype *cft)
4910 { 4905 {
4911 return mem_cgroup_from_css(css)->use_hierarchy; 4906 return mem_cgroup_from_css(css)->use_hierarchy;
4912 } 4907 }
4913 4908
4914 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 4909 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4915 struct cftype *cft, u64 val) 4910 struct cftype *cft, u64 val)
4916 { 4911 {
4917 int retval = 0; 4912 int retval = 0;
4918 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4913 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4919 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 4914 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
4920 4915
4921 mutex_lock(&memcg_create_mutex); 4916 mutex_lock(&memcg_create_mutex);
4922 4917
4923 if (memcg->use_hierarchy == val) 4918 if (memcg->use_hierarchy == val)
4924 goto out; 4919 goto out;
4925 4920
4926 /* 4921 /*
4927 * If parent's use_hierarchy is set, we can't make any modifications 4922 * If parent's use_hierarchy is set, we can't make any modifications
4928 * in the child subtrees. If it is unset, then the change can 4923 * in the child subtrees. If it is unset, then the change can
4929 * occur, provided the current cgroup has no children. 4924 * occur, provided the current cgroup has no children.
4930 * 4925 *
4931 * For the root cgroup, parent_mem is NULL, we allow value to be 4926 * For the root cgroup, parent_mem is NULL, we allow value to be
4932 * set if there are no children. 4927 * set if there are no children.
4933 */ 4928 */
4934 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4929 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4935 (val == 1 || val == 0)) { 4930 (val == 1 || val == 0)) {
4936 if (list_empty(&memcg->css.cgroup->children)) 4931 if (list_empty(&memcg->css.cgroup->children))
4937 memcg->use_hierarchy = val; 4932 memcg->use_hierarchy = val;
4938 else 4933 else
4939 retval = -EBUSY; 4934 retval = -EBUSY;
4940 } else 4935 } else
4941 retval = -EINVAL; 4936 retval = -EINVAL;
4942 4937
4943 out: 4938 out:
4944 mutex_unlock(&memcg_create_mutex); 4939 mutex_unlock(&memcg_create_mutex);
4945 4940
4946 return retval; 4941 return retval;
4947 } 4942 }
4948 4943
4949 4944
4950 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 4945 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4951 enum mem_cgroup_stat_index idx) 4946 enum mem_cgroup_stat_index idx)
4952 { 4947 {
4953 struct mem_cgroup *iter; 4948 struct mem_cgroup *iter;
4954 long val = 0; 4949 long val = 0;
4955 4950
4956 /* Per-cpu values can be negative, use a signed accumulator */ 4951 /* Per-cpu values can be negative, use a signed accumulator */
4957 for_each_mem_cgroup_tree(iter, memcg) 4952 for_each_mem_cgroup_tree(iter, memcg)
4958 val += mem_cgroup_read_stat(iter, idx); 4953 val += mem_cgroup_read_stat(iter, idx);
4959 4954
4960 if (val < 0) /* race ? */ 4955 if (val < 0) /* race ? */
4961 val = 0; 4956 val = 0;
4962 return val; 4957 return val;
4963 } 4958 }
4964 4959
4965 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 4960 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4966 { 4961 {
4967 u64 val; 4962 u64 val;
4968 4963
4969 if (!mem_cgroup_is_root(memcg)) { 4964 if (!mem_cgroup_is_root(memcg)) {
4970 if (!swap) 4965 if (!swap)
4971 return res_counter_read_u64(&memcg->res, RES_USAGE); 4966 return res_counter_read_u64(&memcg->res, RES_USAGE);
4972 else 4967 else
4973 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4968 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4974 } 4969 }
4975 4970
4976 /* 4971 /*
4977 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 4972 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4978 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 4973 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4979 */ 4974 */
4980 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 4975 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4981 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 4976 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4982 4977
4983 if (swap) 4978 if (swap)
4984 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 4979 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4985 4980
4986 return val << PAGE_SHIFT; 4981 return val << PAGE_SHIFT;
4987 } 4982 }
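
For the root memcg the usage is not read from the res_counter but reconstructed from the recursive statistics: page counts for cache and rss (plus swap for the memsw variant) are summed and shifted by PAGE_SHIFT to get bytes. A tiny worked example with made-up counts:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, the common case */

int main(void)
{
	unsigned long long cache = 300, rss = 700, swap = 50;   /* in pages */

	unsigned long long mem   = (cache + rss) << PAGE_SHIFT;
	unsigned long long memsw = (cache + rss + swap) << PAGE_SHIFT;

	printf("usage_in_bytes:       %llu\n", mem);    /* 4096000 */
	printf("memsw.usage_in_bytes: %llu\n", memsw);  /* 4300800 */
	return 0;
}
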
4988 4983
4989 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4984 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4990 struct cftype *cft) 4985 struct cftype *cft)
4991 { 4986 {
4992 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4987 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4993 u64 val; 4988 u64 val;
4994 int name; 4989 int name;
4995 enum res_type type; 4990 enum res_type type;
4996 4991
4997 type = MEMFILE_TYPE(cft->private); 4992 type = MEMFILE_TYPE(cft->private);
4998 name = MEMFILE_ATTR(cft->private); 4993 name = MEMFILE_ATTR(cft->private);
4999 4994
5000 switch (type) { 4995 switch (type) {
5001 case _MEM: 4996 case _MEM:
5002 if (name == RES_USAGE) 4997 if (name == RES_USAGE)
5003 val = mem_cgroup_usage(memcg, false); 4998 val = mem_cgroup_usage(memcg, false);
5004 else 4999 else
5005 val = res_counter_read_u64(&memcg->res, name); 5000 val = res_counter_read_u64(&memcg->res, name);
5006 break; 5001 break;
5007 case _MEMSWAP: 5002 case _MEMSWAP:
5008 if (name == RES_USAGE) 5003 if (name == RES_USAGE)
5009 val = mem_cgroup_usage(memcg, true); 5004 val = mem_cgroup_usage(memcg, true);
5010 else 5005 else
5011 val = res_counter_read_u64(&memcg->memsw, name); 5006 val = res_counter_read_u64(&memcg->memsw, name);
5012 break; 5007 break;
5013 case _KMEM: 5008 case _KMEM:
5014 val = res_counter_read_u64(&memcg->kmem, name); 5009 val = res_counter_read_u64(&memcg->kmem, name);
5015 break; 5010 break;
5016 default: 5011 default:
5017 BUG(); 5012 BUG();
5018 } 5013 }
5019 5014
5020 return val; 5015 return val;
5021 } 5016 }
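
mem_cgroup_read_u64() relies on cft->private carrying both a resource type and an attribute in one integer, which MEMFILE_TYPE() and MEMFILE_ATTR() unpack. The real macros are defined elsewhere in memcontrol.c; the sketch below assumes a simple 16-bit split and uses its own illustrative enums, so the exact values are not the kernel's.

#include <stdio.h>

enum res_type { _MEM, _MEMSWAP, _KMEM };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

#define PACK(type, attr)   (((type) << 16) | (attr))
#define UNPACK_TYPE(val)   (((val) >> 16) & 0xffff)
#define UNPACK_ATTR(val)   ((val) & 0xffff)

int main(void)
{
	int private = PACK(_MEMSWAP, RES_LIMIT);

	/* prints "type=1 attr=1" with the enums declared above */
	printf("type=%d attr=%d\n", UNPACK_TYPE(private), UNPACK_ATTR(private));
	return 0;
}
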
5022 5017
5023 #ifdef CONFIG_MEMCG_KMEM 5018 #ifdef CONFIG_MEMCG_KMEM
5024 /* should be called with activate_kmem_mutex held */ 5019 /* should be called with activate_kmem_mutex held */
5025 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 5020 static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5026 unsigned long long limit) 5021 unsigned long long limit)
5027 { 5022 {
5028 int err = 0; 5023 int err = 0;
5029 int memcg_id; 5024 int memcg_id;
5030 5025
5031 if (memcg_kmem_is_active(memcg)) 5026 if (memcg_kmem_is_active(memcg))
5032 return 0; 5027 return 0;
5033 5028
5034 /* 5029 /*
5035 * We are going to allocate memory for data shared by all memory 5030 * We are going to allocate memory for data shared by all memory
5036 * cgroups so let's stop accounting here. 5031 * cgroups so let's stop accounting here.
5037 */ 5032 */
5038 memcg_stop_kmem_account(); 5033 memcg_stop_kmem_account();
5039 5034
5040 /* 5035 /*
5041 * For simplicity, we won't allow this to be disabled. It also can't 5036 * For simplicity, we won't allow this to be disabled. It also can't
5042 * be changed if the cgroup has children already, or if tasks had 5037 * be changed if the cgroup has children already, or if tasks had
5043 * already joined. 5038 * already joined.
5044 * 5039 *
5045 * If tasks join before we set the limit, a person looking at 5040 * If tasks join before we set the limit, a person looking at
5046 * kmem.usage_in_bytes will have no way to determine when it took 5041 * kmem.usage_in_bytes will have no way to determine when it took
5047 * place, which makes the value quite meaningless. 5042 * place, which makes the value quite meaningless.
5048 * 5043 *
5049 * After it first became limited, changes in the value of the limit are 5044 * After it first became limited, changes in the value of the limit are
5050 * of course permitted. 5045 * of course permitted.
5051 */ 5046 */
5052 mutex_lock(&memcg_create_mutex); 5047 mutex_lock(&memcg_create_mutex);
5053 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) 5048 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
5054 err = -EBUSY; 5049 err = -EBUSY;
5055 mutex_unlock(&memcg_create_mutex); 5050 mutex_unlock(&memcg_create_mutex);
5056 if (err) 5051 if (err)
5057 goto out; 5052 goto out;
5058 5053
5059 memcg_id = ida_simple_get(&kmem_limited_groups, 5054 memcg_id = ida_simple_get(&kmem_limited_groups,
5060 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 5055 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5061 if (memcg_id < 0) { 5056 if (memcg_id < 0) {
5062 err = memcg_id; 5057 err = memcg_id;
5063 goto out; 5058 goto out;
5064 } 5059 }
5065 5060
5066 /* 5061 /*
5067 * Make sure we have enough space for this cgroup in each root cache's 5062 * Make sure we have enough space for this cgroup in each root cache's
5068 * memcg_params. 5063 * memcg_params.
5069 */ 5064 */
5070 err = memcg_update_all_caches(memcg_id + 1); 5065 err = memcg_update_all_caches(memcg_id + 1);
5071 if (err) 5066 if (err)
5072 goto out_rmid; 5067 goto out_rmid;
5073 5068
5074 memcg->kmemcg_id = memcg_id; 5069 memcg->kmemcg_id = memcg_id;
5075 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 5070 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5076 mutex_init(&memcg->slab_caches_mutex); 5071 mutex_init(&memcg->slab_caches_mutex);
5077 5072
5078 /* 5073 /*
5079 * We couldn't have accounted to this cgroup, because it hasn't got the 5074 * We couldn't have accounted to this cgroup, because it hasn't got the
5080 * active bit set yet, so this should succeed. 5075 * active bit set yet, so this should succeed.
5081 */ 5076 */
5082 err = res_counter_set_limit(&memcg->kmem, limit); 5077 err = res_counter_set_limit(&memcg->kmem, limit);
5083 VM_BUG_ON(err); 5078 VM_BUG_ON(err);
5084 5079
5085 static_key_slow_inc(&memcg_kmem_enabled_key); 5080 static_key_slow_inc(&memcg_kmem_enabled_key);
5086 /* 5081 /*
5087 * Setting the active bit after enabling static branching will 5082 * Setting the active bit after enabling static branching will
5088 * guarantee no one starts accounting before all call sites are 5083 * guarantee no one starts accounting before all call sites are
5089 * patched. 5084 * patched.
5090 */ 5085 */
5091 memcg_kmem_set_active(memcg); 5086 memcg_kmem_set_active(memcg);
5092 out: 5087 out:
5093 memcg_resume_kmem_account(); 5088 memcg_resume_kmem_account();
5094 return err; 5089 return err;
5095 5090
5096 out_rmid: 5091 out_rmid:
5097 ida_simple_remove(&kmem_limited_groups, memcg_id); 5092 ida_simple_remove(&kmem_limited_groups, memcg_id);
5098 goto out; 5093 goto out;
5099 } 5094 }
5100 5095
5101 static int memcg_activate_kmem(struct mem_cgroup *memcg, 5096 static int memcg_activate_kmem(struct mem_cgroup *memcg,
5102 unsigned long long limit) 5097 unsigned long long limit)
5103 { 5098 {
5104 int ret; 5099 int ret;
5105 5100
5106 mutex_lock(&activate_kmem_mutex); 5101 mutex_lock(&activate_kmem_mutex);
5107 ret = __memcg_activate_kmem(memcg, limit); 5102 ret = __memcg_activate_kmem(memcg, limit);
5108 mutex_unlock(&activate_kmem_mutex); 5103 mutex_unlock(&activate_kmem_mutex);
5109 return ret; 5104 return ret;
5110 } 5105 }
5111 5106
5112 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 5107 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5113 unsigned long long val) 5108 unsigned long long val)
5114 { 5109 {
5115 int ret; 5110 int ret;
5116 5111
5117 if (!memcg_kmem_is_active(memcg)) 5112 if (!memcg_kmem_is_active(memcg))
5118 ret = memcg_activate_kmem(memcg, val); 5113 ret = memcg_activate_kmem(memcg, val);
5119 else 5114 else
5120 ret = res_counter_set_limit(&memcg->kmem, val); 5115 ret = res_counter_set_limit(&memcg->kmem, val);
5121 return ret; 5116 return ret;
5122 } 5117 }
5123 5118
5124 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5119 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5125 { 5120 {
5126 int ret = 0; 5121 int ret = 0;
5127 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5122 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5128 5123
5129 if (!parent) 5124 if (!parent)
5130 return 0; 5125 return 0;
5131 5126
5132 mutex_lock(&activate_kmem_mutex); 5127 mutex_lock(&activate_kmem_mutex);
5133 /* 5128 /*
5134 * If the parent cgroup is not kmem-active now, it cannot be activated 5129 * If the parent cgroup is not kmem-active now, it cannot be activated
5135 * after this point, because it has at least one child already. 5130 * after this point, because it has at least one child already.
5136 */ 5131 */
5137 if (memcg_kmem_is_active(parent)) 5132 if (memcg_kmem_is_active(parent))
5138 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 5133 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5139 mutex_unlock(&activate_kmem_mutex); 5134 mutex_unlock(&activate_kmem_mutex);
5140 return ret; 5135 return ret;
5141 } 5136 }
5142 #else 5137 #else
5143 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 5138 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5144 unsigned long long val) 5139 unsigned long long val)
5145 { 5140 {
5146 return -EINVAL; 5141 return -EINVAL;
5147 } 5142 }
5148 #endif /* CONFIG_MEMCG_KMEM */ 5143 #endif /* CONFIG_MEMCG_KMEM */
5149 5144
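The kmem activation path above is only reachable through the cgroup v1 filesystem. As a rough illustration (not part of the commit), the sketch below shows how a userspace program would end up in memcg_update_kmem_limit() by writing a kmem limit; the /sys/fs/cgroup/memory mount point, the cgroup name "demo" and the 64M value are assumptions for the example. Per the check in __memcg_activate_kmem(), the write must happen while the cgroup is still empty (no tasks, no children), otherwise it fails with EBUSY.

/*
 * Hypothetical userspace sketch: enable kmem accounting for an empty
 * cgroup by setting memory.kmem.limit_in_bytes. Path and limit value
 * are made up for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/demo/memory.kmem.limit_in_bytes";
	const char *limit = "67108864\n";	/* 64M, arbitrary */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, limit, strlen(limit)) < 0)
		perror("kmem limit write");	/* EBUSY if tasks or children exist */
	if (fd >= 0)
		close(fd);
	return 0;
}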
5150 /* 5145 /*
5151 * The user of this function is... 5146 * The user of this function is...
5152 * RES_LIMIT. 5147 * RES_LIMIT.
5153 */ 5148 */
5154 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5149 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5155 char *buffer) 5150 char *buffer)
5156 { 5151 {
5157 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5152 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5158 enum res_type type; 5153 enum res_type type;
5159 int name; 5154 int name;
5160 unsigned long long val; 5155 unsigned long long val;
5161 int ret; 5156 int ret;
5162 5157
5163 type = MEMFILE_TYPE(cft->private); 5158 type = MEMFILE_TYPE(cft->private);
5164 name = MEMFILE_ATTR(cft->private); 5159 name = MEMFILE_ATTR(cft->private);
5165 5160
5166 switch (name) { 5161 switch (name) {
5167 case RES_LIMIT: 5162 case RES_LIMIT:
5168 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5163 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
5169 ret = -EINVAL; 5164 ret = -EINVAL;
5170 break; 5165 break;
5171 } 5166 }
5172 /* This function does all necessary parse...reuse it */ 5167 /* This function does all necessary parse...reuse it */
5173 ret = res_counter_memparse_write_strategy(buffer, &val); 5168 ret = res_counter_memparse_write_strategy(buffer, &val);
5174 if (ret) 5169 if (ret)
5175 break; 5170 break;
5176 if (type == _MEM) 5171 if (type == _MEM)
5177 ret = mem_cgroup_resize_limit(memcg, val); 5172 ret = mem_cgroup_resize_limit(memcg, val);
5178 else if (type == _MEMSWAP) 5173 else if (type == _MEMSWAP)
5179 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5174 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5180 else if (type == _KMEM) 5175 else if (type == _KMEM)
5181 ret = memcg_update_kmem_limit(memcg, val); 5176 ret = memcg_update_kmem_limit(memcg, val);
5182 else 5177 else
5183 return -EINVAL; 5178 return -EINVAL;
5184 break; 5179 break;
5185 case RES_SOFT_LIMIT: 5180 case RES_SOFT_LIMIT:
5186 ret = res_counter_memparse_write_strategy(buffer, &val); 5181 ret = res_counter_memparse_write_strategy(buffer, &val);
5187 if (ret) 5182 if (ret)
5188 break; 5183 break;
5189 /* 5184 /*
5190 * For memsw, soft limits are hard to implement in terms 5185 * For memsw, soft limits are hard to implement in terms
5191 * of semantics; for now, we support soft limits only for 5186 * of semantics; for now, we support soft limits only for
5192 * memory control without swap. 5187 * memory control without swap.
5193 */ 5188 */
5194 if (type == _MEM) 5189 if (type == _MEM)
5195 ret = res_counter_set_soft_limit(&memcg->res, val); 5190 ret = res_counter_set_soft_limit(&memcg->res, val);
5196 else 5191 else
5197 ret = -EINVAL; 5192 ret = -EINVAL;
5198 break; 5193 break;
5199 default: 5194 default:
5200 ret = -EINVAL; /* should be BUG() ? */ 5195 ret = -EINVAL; /* should be BUG() ? */
5201 break; 5196 break;
5202 } 5197 }
5203 return ret; 5198 return ret;
5204 } 5199 }
5205 5200
5206 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5201 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5207 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5202 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5208 { 5203 {
5209 unsigned long long min_limit, min_memsw_limit, tmp; 5204 unsigned long long min_limit, min_memsw_limit, tmp;
5210 5205
5211 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5206 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5212 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5207 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5213 if (!memcg->use_hierarchy) 5208 if (!memcg->use_hierarchy)
5214 goto out; 5209 goto out;
5215 5210
5216 while (css_parent(&memcg->css)) { 5211 while (css_parent(&memcg->css)) {
5217 memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5212 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5218 if (!memcg->use_hierarchy) 5213 if (!memcg->use_hierarchy)
5219 break; 5214 break;
5220 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5215 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5221 min_limit = min(min_limit, tmp); 5216 min_limit = min(min_limit, tmp);
5222 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5217 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5223 min_memsw_limit = min(min_memsw_limit, tmp); 5218 min_memsw_limit = min(min_memsw_limit, tmp);
5224 } 5219 }
5225 out: 5220 out:
5226 *mem_limit = min_limit; 5221 *mem_limit = min_limit;
5227 *memsw_limit = min_memsw_limit; 5222 *memsw_limit = min_memsw_limit;
5228 } 5223 }
5229 5224
5230 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 5225 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5231 { 5226 {
5232 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5227 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5233 int name; 5228 int name;
5234 enum res_type type; 5229 enum res_type type;
5235 5230
5236 type = MEMFILE_TYPE(event); 5231 type = MEMFILE_TYPE(event);
5237 name = MEMFILE_ATTR(event); 5232 name = MEMFILE_ATTR(event);
5238 5233
5239 switch (name) { 5234 switch (name) {
5240 case RES_MAX_USAGE: 5235 case RES_MAX_USAGE:
5241 if (type == _MEM) 5236 if (type == _MEM)
5242 res_counter_reset_max(&memcg->res); 5237 res_counter_reset_max(&memcg->res);
5243 else if (type == _MEMSWAP) 5238 else if (type == _MEMSWAP)
5244 res_counter_reset_max(&memcg->memsw); 5239 res_counter_reset_max(&memcg->memsw);
5245 else if (type == _KMEM) 5240 else if (type == _KMEM)
5246 res_counter_reset_max(&memcg->kmem); 5241 res_counter_reset_max(&memcg->kmem);
5247 else 5242 else
5248 return -EINVAL; 5243 return -EINVAL;
5249 break; 5244 break;
5250 case RES_FAILCNT: 5245 case RES_FAILCNT:
5251 if (type == _MEM) 5246 if (type == _MEM)
5252 res_counter_reset_failcnt(&memcg->res); 5247 res_counter_reset_failcnt(&memcg->res);
5253 else if (type == _MEMSWAP) 5248 else if (type == _MEMSWAP)
5254 res_counter_reset_failcnt(&memcg->memsw); 5249 res_counter_reset_failcnt(&memcg->memsw);
5255 else if (type == _KMEM) 5250 else if (type == _KMEM)
5256 res_counter_reset_failcnt(&memcg->kmem); 5251 res_counter_reset_failcnt(&memcg->kmem);
5257 else 5252 else
5258 return -EINVAL; 5253 return -EINVAL;
5259 break; 5254 break;
5260 } 5255 }
5261 5256
5262 return 0; 5257 return 0;
5263 } 5258 }
5264 5259
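As a usage note (not part of the commit): mem_cgroup_reset() above is reached when userspace writes to the corresponding v1 reset files; the written value is ignored, and the event argument only carries the MEMFILE type/attr. A minimal sketch, assuming the usual /sys/fs/cgroup/memory mount and a cgroup named "demo":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Best-effort reset; the kernel ignores the written value. */
static void reset_counter(const char *path)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, "0\n", 2) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	reset_counter("/sys/fs/cgroup/memory/demo/memory.max_usage_in_bytes");
	reset_counter("/sys/fs/cgroup/memory/demo/memory.failcnt");
	return 0;
}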
5265 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 5260 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5266 struct cftype *cft) 5261 struct cftype *cft)
5267 { 5262 {
5268 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 5263 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5269 } 5264 }
5270 5265
5271 #ifdef CONFIG_MMU 5266 #ifdef CONFIG_MMU
5272 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5267 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5273 struct cftype *cft, u64 val) 5268 struct cftype *cft, u64 val)
5274 { 5269 {
5275 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5270 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5276 5271
5277 if (val >= (1 << NR_MOVE_TYPE)) 5272 if (val >= (1 << NR_MOVE_TYPE))
5278 return -EINVAL; 5273 return -EINVAL;
5279 5274
5280 /* 5275 /*
5281 * No kind of locking is needed in here, because ->can_attach() will 5276 * No kind of locking is needed in here, because ->can_attach() will
5282 * check this value once in the beginning of the process, and then carry 5277 * check this value once in the beginning of the process, and then carry
5283 * on with stale data. This means that changes to this value will only 5278 * on with stale data. This means that changes to this value will only
5284 * affect task migrations starting after the change. 5279 * affect task migrations starting after the change.
5285 */ 5280 */
5286 memcg->move_charge_at_immigrate = val; 5281 memcg->move_charge_at_immigrate = val;
5287 return 0; 5282 return 0;
5288 } 5283 }
5289 #else 5284 #else
5290 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5285 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5291 struct cftype *cft, u64 val) 5286 struct cftype *cft, u64 val)
5292 { 5287 {
5293 return -ENOSYS; 5288 return -ENOSYS;
5294 } 5289 }
5295 #endif 5290 #endif
5296 5291
5297 #ifdef CONFIG_NUMA 5292 #ifdef CONFIG_NUMA
5298 static int memcg_numa_stat_show(struct seq_file *m, void *v) 5293 static int memcg_numa_stat_show(struct seq_file *m, void *v)
5299 { 5294 {
5300 struct numa_stat { 5295 struct numa_stat {
5301 const char *name; 5296 const char *name;
5302 unsigned int lru_mask; 5297 unsigned int lru_mask;
5303 }; 5298 };
5304 5299
5305 static const struct numa_stat stats[] = { 5300 static const struct numa_stat stats[] = {
5306 { "total", LRU_ALL }, 5301 { "total", LRU_ALL },
5307 { "file", LRU_ALL_FILE }, 5302 { "file", LRU_ALL_FILE },
5308 { "anon", LRU_ALL_ANON }, 5303 { "anon", LRU_ALL_ANON },
5309 { "unevictable", BIT(LRU_UNEVICTABLE) }, 5304 { "unevictable", BIT(LRU_UNEVICTABLE) },
5310 }; 5305 };
5311 const struct numa_stat *stat; 5306 const struct numa_stat *stat;
5312 int nid; 5307 int nid;
5313 unsigned long nr; 5308 unsigned long nr;
5314 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5309 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5315 5310
5316 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5311 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5317 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5312 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5318 seq_printf(m, "%s=%lu", stat->name, nr); 5313 seq_printf(m, "%s=%lu", stat->name, nr);
5319 for_each_node_state(nid, N_MEMORY) { 5314 for_each_node_state(nid, N_MEMORY) {
5320 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5315 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5321 stat->lru_mask); 5316 stat->lru_mask);
5322 seq_printf(m, " N%d=%lu", nid, nr); 5317 seq_printf(m, " N%d=%lu", nid, nr);
5323 } 5318 }
5324 seq_putc(m, '\n'); 5319 seq_putc(m, '\n');
5325 } 5320 }
5326 5321
5327 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5322 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5328 struct mem_cgroup *iter; 5323 struct mem_cgroup *iter;
5329 5324
5330 nr = 0; 5325 nr = 0;
5331 for_each_mem_cgroup_tree(iter, memcg) 5326 for_each_mem_cgroup_tree(iter, memcg)
5332 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 5327 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5333 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 5328 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5334 for_each_node_state(nid, N_MEMORY) { 5329 for_each_node_state(nid, N_MEMORY) {
5335 nr = 0; 5330 nr = 0;
5336 for_each_mem_cgroup_tree(iter, memcg) 5331 for_each_mem_cgroup_tree(iter, memcg)
5337 nr += mem_cgroup_node_nr_lru_pages( 5332 nr += mem_cgroup_node_nr_lru_pages(
5338 iter, nid, stat->lru_mask); 5333 iter, nid, stat->lru_mask);
5339 seq_printf(m, " N%d=%lu", nid, nr); 5334 seq_printf(m, " N%d=%lu", nid, nr);
5340 } 5335 }
5341 seq_putc(m, '\n'); 5336 seq_putc(m, '\n');
5342 } 5337 }
5343 5338
5344 return 0; 5339 return 0;
5345 } 5340 }
5346 #endif /* CONFIG_NUMA */ 5341 #endif /* CONFIG_NUMA */
5347 5342
5348 static inline void mem_cgroup_lru_names_not_uptodate(void) 5343 static inline void mem_cgroup_lru_names_not_uptodate(void)
5349 { 5344 {
5350 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5345 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5351 } 5346 }
5352 5347
5353 static int memcg_stat_show(struct seq_file *m, void *v) 5348 static int memcg_stat_show(struct seq_file *m, void *v)
5354 { 5349 {
5355 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5350 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5356 struct mem_cgroup *mi; 5351 struct mem_cgroup *mi;
5357 unsigned int i; 5352 unsigned int i;
5358 5353
5359 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5354 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5360 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5355 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5361 continue; 5356 continue;
5362 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5357 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5363 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5358 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5364 } 5359 }
5365 5360
5366 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5361 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5367 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5362 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5368 mem_cgroup_read_events(memcg, i)); 5363 mem_cgroup_read_events(memcg, i));
5369 5364
5370 for (i = 0; i < NR_LRU_LISTS; i++) 5365 for (i = 0; i < NR_LRU_LISTS; i++)
5371 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5366 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5372 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5367 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5373 5368
5374 /* Hierarchical information */ 5369 /* Hierarchical information */
5375 { 5370 {
5376 unsigned long long limit, memsw_limit; 5371 unsigned long long limit, memsw_limit;
5377 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5372 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5378 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5373 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5379 if (do_swap_account) 5374 if (do_swap_account)
5380 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5375 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5381 memsw_limit); 5376 memsw_limit);
5382 } 5377 }
5383 5378
5384 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5379 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5385 long long val = 0; 5380 long long val = 0;
5386 5381
5387 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5382 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5388 continue; 5383 continue;
5389 for_each_mem_cgroup_tree(mi, memcg) 5384 for_each_mem_cgroup_tree(mi, memcg)
5390 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5385 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5391 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5386 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5392 } 5387 }
5393 5388
5394 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5389 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5395 unsigned long long val = 0; 5390 unsigned long long val = 0;
5396 5391
5397 for_each_mem_cgroup_tree(mi, memcg) 5392 for_each_mem_cgroup_tree(mi, memcg)
5398 val += mem_cgroup_read_events(mi, i); 5393 val += mem_cgroup_read_events(mi, i);
5399 seq_printf(m, "total_%s %llu\n", 5394 seq_printf(m, "total_%s %llu\n",
5400 mem_cgroup_events_names[i], val); 5395 mem_cgroup_events_names[i], val);
5401 } 5396 }
5402 5397
5403 for (i = 0; i < NR_LRU_LISTS; i++) { 5398 for (i = 0; i < NR_LRU_LISTS; i++) {
5404 unsigned long long val = 0; 5399 unsigned long long val = 0;
5405 5400
5406 for_each_mem_cgroup_tree(mi, memcg) 5401 for_each_mem_cgroup_tree(mi, memcg)
5407 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5402 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5408 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5403 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5409 } 5404 }
5410 5405
5411 #ifdef CONFIG_DEBUG_VM 5406 #ifdef CONFIG_DEBUG_VM
5412 { 5407 {
5413 int nid, zid; 5408 int nid, zid;
5414 struct mem_cgroup_per_zone *mz; 5409 struct mem_cgroup_per_zone *mz;
5415 struct zone_reclaim_stat *rstat; 5410 struct zone_reclaim_stat *rstat;
5416 unsigned long recent_rotated[2] = {0, 0}; 5411 unsigned long recent_rotated[2] = {0, 0};
5417 unsigned long recent_scanned[2] = {0, 0}; 5412 unsigned long recent_scanned[2] = {0, 0};
5418 5413
5419 for_each_online_node(nid) 5414 for_each_online_node(nid)
5420 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5415 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5421 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5416 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5422 rstat = &mz->lruvec.reclaim_stat; 5417 rstat = &mz->lruvec.reclaim_stat;
5423 5418
5424 recent_rotated[0] += rstat->recent_rotated[0]; 5419 recent_rotated[0] += rstat->recent_rotated[0];
5425 recent_rotated[1] += rstat->recent_rotated[1]; 5420 recent_rotated[1] += rstat->recent_rotated[1];
5426 recent_scanned[0] += rstat->recent_scanned[0]; 5421 recent_scanned[0] += rstat->recent_scanned[0];
5427 recent_scanned[1] += rstat->recent_scanned[1]; 5422 recent_scanned[1] += rstat->recent_scanned[1];
5428 } 5423 }
5429 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5424 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5430 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5425 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5431 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5426 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5432 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5427 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5433 } 5428 }
5434 #endif 5429 #endif
5435 5430
5436 return 0; 5431 return 0;
5437 } 5432 }
5438 5433
5439 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 5434 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5440 struct cftype *cft) 5435 struct cftype *cft)
5441 { 5436 {
5442 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5443 5438
5444 return mem_cgroup_swappiness(memcg); 5439 return mem_cgroup_swappiness(memcg);
5445 } 5440 }
5446 5441
5447 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 5442 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5448 struct cftype *cft, u64 val) 5443 struct cftype *cft, u64 val)
5449 { 5444 {
5450 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5451 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5446 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5452 5447
5453 if (val > 100 || !parent) 5448 if (val > 100 || !parent)
5454 return -EINVAL; 5449 return -EINVAL;
5455 5450
5456 mutex_lock(&memcg_create_mutex); 5451 mutex_lock(&memcg_create_mutex);
5457 5452
5458 /* If under hierarchy, only empty-root can set this value */ 5453 /* If under hierarchy, only empty-root can set this value */
5459 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5454 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5460 mutex_unlock(&memcg_create_mutex); 5455 mutex_unlock(&memcg_create_mutex);
5461 return -EINVAL; 5456 return -EINVAL;
5462 } 5457 }
5463 5458
5464 memcg->swappiness = val; 5459 memcg->swappiness = val;
5465 5460
5466 mutex_unlock(&memcg_create_mutex); 5461 mutex_unlock(&memcg_create_mutex);
5467 5462
5468 return 0; 5463 return 0;
5469 } 5464 }
5470 5465
5471 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5466 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5472 { 5467 {
5473 struct mem_cgroup_threshold_ary *t; 5468 struct mem_cgroup_threshold_ary *t;
5474 u64 usage; 5469 u64 usage;
5475 int i; 5470 int i;
5476 5471
5477 rcu_read_lock(); 5472 rcu_read_lock();
5478 if (!swap) 5473 if (!swap)
5479 t = rcu_dereference(memcg->thresholds.primary); 5474 t = rcu_dereference(memcg->thresholds.primary);
5480 else 5475 else
5481 t = rcu_dereference(memcg->memsw_thresholds.primary); 5476 t = rcu_dereference(memcg->memsw_thresholds.primary);
5482 5477
5483 if (!t) 5478 if (!t)
5484 goto unlock; 5479 goto unlock;
5485 5480
5486 usage = mem_cgroup_usage(memcg, swap); 5481 usage = mem_cgroup_usage(memcg, swap);
5487 5482
5488 /* 5483 /*
5489 * current_threshold points to the threshold just below or equal to usage. 5484 * current_threshold points to the threshold just below or equal to usage.
5490 * If that is no longer true, a threshold was crossed after the last 5485 * If that is no longer true, a threshold was crossed after the last
5491 * call of __mem_cgroup_threshold(). 5486 * call of __mem_cgroup_threshold().
5492 */ 5487 */
5493 i = t->current_threshold; 5488 i = t->current_threshold;
5494 5489
5495 /* 5490 /*
5496 * Iterate backward over array of thresholds starting from 5491 * Iterate backward over array of thresholds starting from
5497 * current_threshold and check if a threshold is crossed. 5492 * current_threshold and check if a threshold is crossed.
5498 * If none of the thresholds below usage is crossed, we read 5493 * If none of the thresholds below usage is crossed, we read
5499 * only one element of the array here. 5494 * only one element of the array here.
5500 */ 5495 */
5501 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5496 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5502 eventfd_signal(t->entries[i].eventfd, 1); 5497 eventfd_signal(t->entries[i].eventfd, 1);
5503 5498
5504 /* i = current_threshold + 1 */ 5499 /* i = current_threshold + 1 */
5505 i++; 5500 i++;
5506 5501
5507 /* 5502 /*
5508 * Iterate forward over array of thresholds starting from 5503 * Iterate forward over array of thresholds starting from
5509 * current_threshold+1 and check if a threshold is crossed. 5504 * current_threshold+1 and check if a threshold is crossed.
5510 * If none of the thresholds above usage is crossed, we read 5505 * If none of the thresholds above usage is crossed, we read
5511 * only one element of the array here. 5506 * only one element of the array here.
5512 */ 5507 */
5513 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5508 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5514 eventfd_signal(t->entries[i].eventfd, 1); 5509 eventfd_signal(t->entries[i].eventfd, 1);
5515 5510
5516 /* Update current_threshold */ 5511 /* Update current_threshold */
5517 t->current_threshold = i - 1; 5512 t->current_threshold = i - 1;
5518 unlock: 5513 unlock:
5519 rcu_read_unlock(); 5514 rcu_read_unlock();
5520 } 5515 }
5521 5516
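For readers skimming the diff, the scan in __mem_cgroup_threshold() above is easier to see in isolation. Below is a small, self-contained C sketch (illustrative only, with made-up names) of the same invariant: entries[] is sorted by threshold, cur indexes the largest threshold at or below the previously seen usage, and a crossing in either direction signals every threshold passed over.

struct thr {
	unsigned long long threshold;
	int fired;			/* stands in for eventfd_signal() */
};

/* Illustrative restatement of the scan; not kernel code. */
static void scan_thresholds(struct thr *entries, int size, int *cur,
			    unsigned long long usage)
{
	int i = *cur;

	/* usage dropped: walk left over thresholds now above usage */
	for (; i >= 0 && entries[i].threshold > usage; i--)
		entries[i].fired++;

	/* usage grew: walk right over thresholds now at or below usage */
	for (i++; i < size && entries[i].threshold <= usage; i++)
		entries[i].fired++;

	/* i ends up one past the last threshold <= usage */
	*cur = i - 1;
}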
5522 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5517 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5523 { 5518 {
5524 while (memcg) { 5519 while (memcg) {
5525 __mem_cgroup_threshold(memcg, false); 5520 __mem_cgroup_threshold(memcg, false);
5526 if (do_swap_account) 5521 if (do_swap_account)
5527 __mem_cgroup_threshold(memcg, true); 5522 __mem_cgroup_threshold(memcg, true);
5528 5523
5529 memcg = parent_mem_cgroup(memcg); 5524 memcg = parent_mem_cgroup(memcg);
5530 } 5525 }
5531 } 5526 }
5532 5527
5533 static int compare_thresholds(const void *a, const void *b) 5528 static int compare_thresholds(const void *a, const void *b)
5534 { 5529 {
5535 const struct mem_cgroup_threshold *_a = a; 5530 const struct mem_cgroup_threshold *_a = a;
5536 const struct mem_cgroup_threshold *_b = b; 5531 const struct mem_cgroup_threshold *_b = b;
5537 5532
5538 if (_a->threshold > _b->threshold) 5533 if (_a->threshold > _b->threshold)
5539 return 1; 5534 return 1;
5540 5535
5541 if (_a->threshold < _b->threshold) 5536 if (_a->threshold < _b->threshold)
5542 return -1; 5537 return -1;
5543 5538
5544 return 0; 5539 return 0;
5545 } 5540 }
5546 5541
5547 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5542 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5548 { 5543 {
5549 struct mem_cgroup_eventfd_list *ev; 5544 struct mem_cgroup_eventfd_list *ev;
5550 5545
5551 list_for_each_entry(ev, &memcg->oom_notify, list) 5546 list_for_each_entry(ev, &memcg->oom_notify, list)
5552 eventfd_signal(ev->eventfd, 1); 5547 eventfd_signal(ev->eventfd, 1);
5553 return 0; 5548 return 0;
5554 } 5549 }
5555 5550
5556 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5551 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5557 { 5552 {
5558 struct mem_cgroup *iter; 5553 struct mem_cgroup *iter;
5559 5554
5560 for_each_mem_cgroup_tree(iter, memcg) 5555 for_each_mem_cgroup_tree(iter, memcg)
5561 mem_cgroup_oom_notify_cb(iter); 5556 mem_cgroup_oom_notify_cb(iter);
5562 } 5557 }
5563 5558
5564 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5559 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5565 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 5560 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5566 { 5561 {
5567 struct mem_cgroup_thresholds *thresholds; 5562 struct mem_cgroup_thresholds *thresholds;
5568 struct mem_cgroup_threshold_ary *new; 5563 struct mem_cgroup_threshold_ary *new;
5569 u64 threshold, usage; 5564 u64 threshold, usage;
5570 int i, size, ret; 5565 int i, size, ret;
5571 5566
5572 ret = res_counter_memparse_write_strategy(args, &threshold); 5567 ret = res_counter_memparse_write_strategy(args, &threshold);
5573 if (ret) 5568 if (ret)
5574 return ret; 5569 return ret;
5575 5570
5576 mutex_lock(&memcg->thresholds_lock); 5571 mutex_lock(&memcg->thresholds_lock);
5577 5572
5578 if (type == _MEM) 5573 if (type == _MEM)
5579 thresholds = &memcg->thresholds; 5574 thresholds = &memcg->thresholds;
5580 else if (type == _MEMSWAP) 5575 else if (type == _MEMSWAP)
5581 thresholds = &memcg->memsw_thresholds; 5576 thresholds = &memcg->memsw_thresholds;
5582 else 5577 else
5583 BUG(); 5578 BUG();
5584 5579
5585 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5580 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5586 5581
5587 /* Check if a threshold crossed before adding a new one */ 5582 /* Check if a threshold crossed before adding a new one */
5588 if (thresholds->primary) 5583 if (thresholds->primary)
5589 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5584 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5590 5585
5591 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5586 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5592 5587
5593 /* Allocate memory for new array of thresholds */ 5588 /* Allocate memory for new array of thresholds */
5594 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5589 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5595 GFP_KERNEL); 5590 GFP_KERNEL);
5596 if (!new) { 5591 if (!new) {
5597 ret = -ENOMEM; 5592 ret = -ENOMEM;
5598 goto unlock; 5593 goto unlock;
5599 } 5594 }
5600 new->size = size; 5595 new->size = size;
5601 5596
5602 /* Copy thresholds (if any) to new array */ 5597 /* Copy thresholds (if any) to new array */
5603 if (thresholds->primary) { 5598 if (thresholds->primary) {
5604 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5599 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5605 sizeof(struct mem_cgroup_threshold)); 5600 sizeof(struct mem_cgroup_threshold));
5606 } 5601 }
5607 5602
5608 /* Add new threshold */ 5603 /* Add new threshold */
5609 new->entries[size - 1].eventfd = eventfd; 5604 new->entries[size - 1].eventfd = eventfd;
5610 new->entries[size - 1].threshold = threshold; 5605 new->entries[size - 1].threshold = threshold;
5611 5606
5612 /* Sort thresholds. Registering of new threshold isn't time-critical */ 5607 /* Sort thresholds. Registering of new threshold isn't time-critical */
5613 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5608 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5614 compare_thresholds, NULL); 5609 compare_thresholds, NULL);
5615 5610
5616 /* Find current threshold */ 5611 /* Find current threshold */
5617 new->current_threshold = -1; 5612 new->current_threshold = -1;
5618 for (i = 0; i < size; i++) { 5613 for (i = 0; i < size; i++) {
5619 if (new->entries[i].threshold <= usage) { 5614 if (new->entries[i].threshold <= usage) {
5620 /* 5615 /*
5621 * new->current_threshold will not be used until 5616 * new->current_threshold will not be used until
5622 * rcu_assign_pointer(), so it's safe to increment 5617 * rcu_assign_pointer(), so it's safe to increment
5623 * it here. 5618 * it here.
5624 */ 5619 */
5625 ++new->current_threshold; 5620 ++new->current_threshold;
5626 } else 5621 } else
5627 break; 5622 break;
5628 } 5623 }
5629 5624
5630 /* Free old spare buffer and save old primary buffer as spare */ 5625 /* Free old spare buffer and save old primary buffer as spare */
5631 kfree(thresholds->spare); 5626 kfree(thresholds->spare);
5632 thresholds->spare = thresholds->primary; 5627 thresholds->spare = thresholds->primary;
5633 5628
5634 rcu_assign_pointer(thresholds->primary, new); 5629 rcu_assign_pointer(thresholds->primary, new);
5635 5630
5636 /* To be sure that nobody uses thresholds */ 5631 /* To be sure that nobody uses thresholds */
5637 synchronize_rcu(); 5632 synchronize_rcu();
5638 5633
5639 unlock: 5634 unlock:
5640 mutex_unlock(&memcg->thresholds_lock); 5635 mutex_unlock(&memcg->thresholds_lock);
5641 5636
5642 return ret; 5637 return ret;
5643 } 5638 }
5644 5639
5645 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5640 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5646 struct eventfd_ctx *eventfd, const char *args) 5641 struct eventfd_ctx *eventfd, const char *args)
5647 { 5642 {
5648 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 5643 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5649 } 5644 }
5650 5645
5651 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 5646 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5652 struct eventfd_ctx *eventfd, const char *args) 5647 struct eventfd_ctx *eventfd, const char *args)
5653 { 5648 {
5654 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 5649 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5655 } 5650 }
5656 5651
5657 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5652 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5658 struct eventfd_ctx *eventfd, enum res_type type) 5653 struct eventfd_ctx *eventfd, enum res_type type)
5659 { 5654 {
5660 struct mem_cgroup_thresholds *thresholds; 5655 struct mem_cgroup_thresholds *thresholds;
5661 struct mem_cgroup_threshold_ary *new; 5656 struct mem_cgroup_threshold_ary *new;
5662 u64 usage; 5657 u64 usage;
5663 int i, j, size; 5658 int i, j, size;
5664 5659
5665 mutex_lock(&memcg->thresholds_lock); 5660 mutex_lock(&memcg->thresholds_lock);
5666 if (type == _MEM) 5661 if (type == _MEM)
5667 thresholds = &memcg->thresholds; 5662 thresholds = &memcg->thresholds;
5668 else if (type == _MEMSWAP) 5663 else if (type == _MEMSWAP)
5669 thresholds = &memcg->memsw_thresholds; 5664 thresholds = &memcg->memsw_thresholds;
5670 else 5665 else
5671 BUG(); 5666 BUG();
5672 5667
5673 if (!thresholds->primary) 5668 if (!thresholds->primary)
5674 goto unlock; 5669 goto unlock;
5675 5670
5676 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5671 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5677 5672
5678 /* Check if a threshold crossed before removing */ 5673 /* Check if a threshold crossed before removing */
5679 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5674 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5680 5675
5681 /* Calculate the new number of thresholds */ 5676 /* Calculate the new number of thresholds */
5682 size = 0; 5677 size = 0;
5683 for (i = 0; i < thresholds->primary->size; i++) { 5678 for (i = 0; i < thresholds->primary->size; i++) {
5684 if (thresholds->primary->entries[i].eventfd != eventfd) 5679 if (thresholds->primary->entries[i].eventfd != eventfd)
5685 size++; 5680 size++;
5686 } 5681 }
5687 5682
5688 new = thresholds->spare; 5683 new = thresholds->spare;
5689 5684
5690 /* Set thresholds array to NULL if we don't have thresholds */ 5685 /* Set thresholds array to NULL if we don't have thresholds */
5691 if (!size) { 5686 if (!size) {
5692 kfree(new); 5687 kfree(new);
5693 new = NULL; 5688 new = NULL;
5694 goto swap_buffers; 5689 goto swap_buffers;
5695 } 5690 }
5696 5691
5697 new->size = size; 5692 new->size = size;
5698 5693
5699 /* Copy thresholds and find current threshold */ 5694 /* Copy thresholds and find current threshold */
5700 new->current_threshold = -1; 5695 new->current_threshold = -1;
5701 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5696 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5702 if (thresholds->primary->entries[i].eventfd == eventfd) 5697 if (thresholds->primary->entries[i].eventfd == eventfd)
5703 continue; 5698 continue;
5704 5699
5705 new->entries[j] = thresholds->primary->entries[i]; 5700 new->entries[j] = thresholds->primary->entries[i];
5706 if (new->entries[j].threshold <= usage) { 5701 if (new->entries[j].threshold <= usage) {
5707 /* 5702 /*
5708 * new->current_threshold will not be used 5703 * new->current_threshold will not be used
5709 * until rcu_assign_pointer(), so it's safe to increment 5704 * until rcu_assign_pointer(), so it's safe to increment
5710 * it here. 5705 * it here.
5711 */ 5706 */
5712 ++new->current_threshold; 5707 ++new->current_threshold;
5713 } 5708 }
5714 j++; 5709 j++;
5715 } 5710 }
5716 5711
5717 swap_buffers: 5712 swap_buffers:
5718 /* Swap primary and spare array */ 5713 /* Swap primary and spare array */
5719 thresholds->spare = thresholds->primary; 5714 thresholds->spare = thresholds->primary;
5720 /* If all events are unregistered, free the spare array */ 5715 /* If all events are unregistered, free the spare array */
5721 if (!new) { 5716 if (!new) {
5722 kfree(thresholds->spare); 5717 kfree(thresholds->spare);
5723 thresholds->spare = NULL; 5718 thresholds->spare = NULL;
5724 } 5719 }
5725 5720
5726 rcu_assign_pointer(thresholds->primary, new); 5721 rcu_assign_pointer(thresholds->primary, new);
5727 5722
5728 /* To be sure that nobody uses thresholds */ 5723 /* To be sure that nobody uses thresholds */
5729 synchronize_rcu(); 5724 synchronize_rcu();
5730 unlock: 5725 unlock:
5731 mutex_unlock(&memcg->thresholds_lock); 5726 mutex_unlock(&memcg->thresholds_lock);
5732 } 5727 }
5733 5728
5734 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5729 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5735 struct eventfd_ctx *eventfd) 5730 struct eventfd_ctx *eventfd)
5736 { 5731 {
5737 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 5732 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5738 } 5733 }
5739 5734
5740 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5735 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5741 struct eventfd_ctx *eventfd) 5736 struct eventfd_ctx *eventfd)
5742 { 5737 {
5743 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 5738 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5744 } 5739 }
5745 5740
5746 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 5741 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5747 struct eventfd_ctx *eventfd, const char *args) 5742 struct eventfd_ctx *eventfd, const char *args)
5748 { 5743 {
5749 struct mem_cgroup_eventfd_list *event; 5744 struct mem_cgroup_eventfd_list *event;
5750 5745
5751 event = kmalloc(sizeof(*event), GFP_KERNEL); 5746 event = kmalloc(sizeof(*event), GFP_KERNEL);
5752 if (!event) 5747 if (!event)
5753 return -ENOMEM; 5748 return -ENOMEM;
5754 5749
5755 spin_lock(&memcg_oom_lock); 5750 spin_lock(&memcg_oom_lock);
5756 5751
5757 event->eventfd = eventfd; 5752 event->eventfd = eventfd;
5758 list_add(&event->list, &memcg->oom_notify); 5753 list_add(&event->list, &memcg->oom_notify);
5759 5754
5760 /* already in OOM ? */ 5755 /* already in OOM ? */
5761 if (atomic_read(&memcg->under_oom)) 5756 if (atomic_read(&memcg->under_oom))
5762 eventfd_signal(eventfd, 1); 5757 eventfd_signal(eventfd, 1);
5763 spin_unlock(&memcg_oom_lock); 5758 spin_unlock(&memcg_oom_lock);
5764 5759
5765 return 0; 5760 return 0;
5766 } 5761 }
5767 5762
5768 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 5763 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5769 struct eventfd_ctx *eventfd) 5764 struct eventfd_ctx *eventfd)
5770 { 5765 {
5771 struct mem_cgroup_eventfd_list *ev, *tmp; 5766 struct mem_cgroup_eventfd_list *ev, *tmp;
5772 5767
5773 spin_lock(&memcg_oom_lock); 5768 spin_lock(&memcg_oom_lock);
5774 5769
5775 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 5770 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5776 if (ev->eventfd == eventfd) { 5771 if (ev->eventfd == eventfd) {
5777 list_del(&ev->list); 5772 list_del(&ev->list);
5778 kfree(ev); 5773 kfree(ev);
5779 } 5774 }
5780 } 5775 }
5781 5776
5782 spin_unlock(&memcg_oom_lock); 5777 spin_unlock(&memcg_oom_lock);
5783 } 5778 }
5784 5779
5785 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 5780 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5786 { 5781 {
5787 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 5782 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5788 5783
5789 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 5784 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5790 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 5785 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5791 return 0; 5786 return 0;
5792 } 5787 }
5793 5788
5794 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 5789 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5795 struct cftype *cft, u64 val) 5790 struct cftype *cft, u64 val)
5796 { 5791 {
5797 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5792 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5798 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5793 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5799 5794
5800 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5795 /* cannot set to root cgroup and only 0 and 1 are allowed */
5801 if (!parent || !((val == 0) || (val == 1))) 5796 if (!parent || !((val == 0) || (val == 1)))
5802 return -EINVAL; 5797 return -EINVAL;
5803 5798
5804 mutex_lock(&memcg_create_mutex); 5799 mutex_lock(&memcg_create_mutex);
5805 /* oom-kill-disable is a flag for subhierarchy. */ 5800 /* oom-kill-disable is a flag for subhierarchy. */
5806 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5801 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5807 mutex_unlock(&memcg_create_mutex); 5802 mutex_unlock(&memcg_create_mutex);
5808 return -EINVAL; 5803 return -EINVAL;
5809 } 5804 }
5810 memcg->oom_kill_disable = val; 5805 memcg->oom_kill_disable = val;
5811 if (!val) 5806 if (!val)
5812 memcg_oom_recover(memcg); 5807 memcg_oom_recover(memcg);
5813 mutex_unlock(&memcg_create_mutex); 5808 mutex_unlock(&memcg_create_mutex);
5814 return 0; 5809 return 0;
5815 } 5810 }
5816 5811
5817 #ifdef CONFIG_MEMCG_KMEM 5812 #ifdef CONFIG_MEMCG_KMEM
5818 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5813 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5819 { 5814 {
5820 int ret; 5815 int ret;
5821 5816
5822 memcg->kmemcg_id = -1; 5817 memcg->kmemcg_id = -1;
5823 ret = memcg_propagate_kmem(memcg); 5818 ret = memcg_propagate_kmem(memcg);
5824 if (ret) 5819 if (ret)
5825 return ret; 5820 return ret;
5826 5821
5827 return mem_cgroup_sockets_init(memcg, ss); 5822 return mem_cgroup_sockets_init(memcg, ss);
5828 } 5823 }
5829 5824
5830 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5825 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5831 { 5826 {
5832 mem_cgroup_sockets_destroy(memcg); 5827 mem_cgroup_sockets_destroy(memcg);
5833 } 5828 }
5834 5829
5835 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5830 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5836 { 5831 {
5837 if (!memcg_kmem_is_active(memcg)) 5832 if (!memcg_kmem_is_active(memcg))
5838 return; 5833 return;
5839 5834
5840 /* 5835 /*
5841 * kmem charges can outlive the cgroup. In the case of slab 5836 * kmem charges can outlive the cgroup. In the case of slab
5842 * pages, for instance, a page may contain objects from various 5837 * pages, for instance, a page may contain objects from various
5843 * processes. As we do not take a reference for every such 5838 * processes. As we do not take a reference for every such
5844 * allocation, we have to be careful when doing uncharge 5839 * allocation, we have to be careful when doing uncharge
5845 * (see memcg_uncharge_kmem) and here during offlining. 5840 * (see memcg_uncharge_kmem) and here during offlining.
5846 * 5841 *
5847 * The idea is that only the _last_ uncharge which sees 5842 * The idea is that only the _last_ uncharge which sees
5848 * the dead memcg will drop the last reference. An additional 5843 * the dead memcg will drop the last reference. An additional
5849 * reference is taken here before the group is marked dead 5844 * reference is taken here before the group is marked dead
5850 * which is then paired with css_put during uncharge resp. here. 5845 * which is then paired with css_put during uncharge resp. here.
5851 * 5846 *
5852 * Although this might sound strange as this path is called from 5847 * Although this might sound strange as this path is called from
5853 * css_offline() when the reference might have dropped down to 0 5848 * css_offline() when the reference might have dropped down to 0
5854 * and shouldn't be incremented anymore (css_tryget would fail) 5849 * and shouldn't be incremented anymore (css_tryget would fail)
5855 * we do not have other options because of the kmem allocations 5850 * we do not have other options because of the kmem allocations
5856 * lifetime. 5851 * lifetime.
5857 */ 5852 */
5858 css_get(&memcg->css); 5853 css_get(&memcg->css);
5859 5854
5860 memcg_kmem_mark_dead(memcg); 5855 memcg_kmem_mark_dead(memcg);
5861 5856
5862 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5857 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5863 return; 5858 return;
5864 5859
5865 if (memcg_kmem_test_and_clear_dead(memcg)) 5860 if (memcg_kmem_test_and_clear_dead(memcg))
5866 css_put(&memcg->css); 5861 css_put(&memcg->css);
5867 } 5862 }
5868 #else 5863 #else
5869 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5864 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5870 { 5865 {
5871 return 0; 5866 return 0;
5872 } 5867 }
5873 5868
5874 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5869 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5875 { 5870 {
5876 } 5871 }
5877 5872
5878 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5873 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5879 { 5874 {
5880 } 5875 }
5881 #endif 5876 #endif
5882 5877
5883 /* 5878 /*
5884 * DO NOT USE IN NEW FILES. 5879 * DO NOT USE IN NEW FILES.
5885 * 5880 *
5886 * "cgroup.event_control" implementation. 5881 * "cgroup.event_control" implementation.
5887 * 5882 *
5888 * This is way over-engineered. It tries to support fully configurable 5883 * This is way over-engineered. It tries to support fully configurable
5889 * events for each user. Such level of flexibility is completely 5884 * events for each user. Such level of flexibility is completely
5890 * unnecessary especially in the light of the planned unified hierarchy. 5885 * unnecessary especially in the light of the planned unified hierarchy.
5891 * 5886 *
5892 * Please deprecate this and replace with something simpler if at all 5887 * Please deprecate this and replace with something simpler if at all
5893 * possible. 5888 * possible.
5894 */ 5889 */
5895 5890
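To make the legacy interface concrete, here is a hedged userspace sketch of the '<event_fd> <control_fd> <args>' protocol that memcg_write_event_control() below parses. The mount point, the cgroup name "demo" and the 64M threshold are assumptions for the example; since the control file is memory.usage_in_bytes, the callback registered is mem_cgroup_usage_register_event().

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
		       O_RDONLY);
	int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
			O_WRONLY);
	char buf[64];
	uint64_t ticks;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args is a usage threshold here */
	snprintf(buf, sizeof(buf), "%d %d 67108864", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until the 64M usage threshold is crossed */
	if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("usage threshold crossed\n");
	return 0;
}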
5896 /* 5891 /*
5897 * Unregister event and free resources. 5892 * Unregister event and free resources.
5898 * 5893 *
5899 * Gets called from workqueue. 5894 * Gets called from workqueue.
5900 */ 5895 */
5901 static void memcg_event_remove(struct work_struct *work) 5896 static void memcg_event_remove(struct work_struct *work)
5902 { 5897 {
5903 struct mem_cgroup_event *event = 5898 struct mem_cgroup_event *event =
5904 container_of(work, struct mem_cgroup_event, remove); 5899 container_of(work, struct mem_cgroup_event, remove);
5905 struct mem_cgroup *memcg = event->memcg; 5900 struct mem_cgroup *memcg = event->memcg;
5906 5901
5907 remove_wait_queue(event->wqh, &event->wait); 5902 remove_wait_queue(event->wqh, &event->wait);
5908 5903
5909 event->unregister_event(memcg, event->eventfd); 5904 event->unregister_event(memcg, event->eventfd);
5910 5905
5911 /* Notify userspace the event is going away. */ 5906 /* Notify userspace the event is going away. */
5912 eventfd_signal(event->eventfd, 1); 5907 eventfd_signal(event->eventfd, 1);
5913 5908
5914 eventfd_ctx_put(event->eventfd); 5909 eventfd_ctx_put(event->eventfd);
5915 kfree(event); 5910 kfree(event);
5916 css_put(&memcg->css); 5911 css_put(&memcg->css);
5917 } 5912 }
5918 5913
5919 /* 5914 /*
5920 * Gets called on POLLHUP on eventfd when user closes it. 5915 * Gets called on POLLHUP on eventfd when user closes it.
5921 * 5916 *
5922 * Called with wqh->lock held and interrupts disabled. 5917 * Called with wqh->lock held and interrupts disabled.
5923 */ 5918 */
5924 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 5919 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
5925 int sync, void *key) 5920 int sync, void *key)
5926 { 5921 {
5927 struct mem_cgroup_event *event = 5922 struct mem_cgroup_event *event =
5928 container_of(wait, struct mem_cgroup_event, wait); 5923 container_of(wait, struct mem_cgroup_event, wait);
5929 struct mem_cgroup *memcg = event->memcg; 5924 struct mem_cgroup *memcg = event->memcg;
5930 unsigned long flags = (unsigned long)key; 5925 unsigned long flags = (unsigned long)key;
5931 5926
5932 if (flags & POLLHUP) { 5927 if (flags & POLLHUP) {
5933 /* 5928 /*
5934 * If the event has been detached at cgroup removal, we 5929 * If the event has been detached at cgroup removal, we
5935 * can simply return knowing the other side will clean up 5930 * can simply return knowing the other side will clean up
5936 * for us. 5931 * for us.
5937 * 5932 *
5938 * We can't race against event freeing since the other 5933 * We can't race against event freeing since the other
5939 * side will require wqh->lock via remove_wait_queue(), 5934 * side will require wqh->lock via remove_wait_queue(),
5940 * which we hold. 5935 * which we hold.
5941 */ 5936 */
5942 spin_lock(&memcg->event_list_lock); 5937 spin_lock(&memcg->event_list_lock);
5943 if (!list_empty(&event->list)) { 5938 if (!list_empty(&event->list)) {
5944 list_del_init(&event->list); 5939 list_del_init(&event->list);
5945 /* 5940 /*
5946 * We are in atomic context, but memcg_event_remove() 5941 * We are in atomic context, but memcg_event_remove()
5947 * may sleep, so we have to call it from a workqueue. 5942 * may sleep, so we have to call it from a workqueue.
5948 */ 5943 */
5949 schedule_work(&event->remove); 5944 schedule_work(&event->remove);
5950 } 5945 }
5951 spin_unlock(&memcg->event_list_lock); 5946 spin_unlock(&memcg->event_list_lock);
5952 } 5947 }
5953 5948
5954 return 0; 5949 return 0;
5955 } 5950 }
5956 5951
5957 static void memcg_event_ptable_queue_proc(struct file *file, 5952 static void memcg_event_ptable_queue_proc(struct file *file,
5958 wait_queue_head_t *wqh, poll_table *pt) 5953 wait_queue_head_t *wqh, poll_table *pt)
5959 { 5954 {
5960 struct mem_cgroup_event *event = 5955 struct mem_cgroup_event *event =
5961 container_of(pt, struct mem_cgroup_event, pt); 5956 container_of(pt, struct mem_cgroup_event, pt);
5962 5957
5963 event->wqh = wqh; 5958 event->wqh = wqh;
5964 add_wait_queue(wqh, &event->wait); 5959 add_wait_queue(wqh, &event->wait);
5965 } 5960 }
5966 5961
5967 /* 5962 /*
5968 * DO NOT USE IN NEW FILES. 5963 * DO NOT USE IN NEW FILES.
5969 * 5964 *
5970 * Parse input and register new cgroup event handler. 5965 * Parse input and register new cgroup event handler.
5971 * 5966 *
5972 * Input must be in format '<event_fd> <control_fd> <args>'. 5967 * Input must be in format '<event_fd> <control_fd> <args>'.
5973 * Interpretation of args is defined by control file implementation. 5968 * Interpretation of args is defined by control file implementation.
5974 */ 5969 */
5975 static int memcg_write_event_control(struct cgroup_subsys_state *css, 5970 static int memcg_write_event_control(struct cgroup_subsys_state *css,
5976 struct cftype *cft, char *buffer) 5971 struct cftype *cft, char *buffer)
5977 { 5972 {
5978 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5973 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5979 struct mem_cgroup_event *event; 5974 struct mem_cgroup_event *event;
5980 struct cgroup_subsys_state *cfile_css; 5975 struct cgroup_subsys_state *cfile_css;
5981 unsigned int efd, cfd; 5976 unsigned int efd, cfd;
5982 struct fd efile; 5977 struct fd efile;
5983 struct fd cfile; 5978 struct fd cfile;
5984 const char *name; 5979 const char *name;
5985 char *endp; 5980 char *endp;
5986 int ret; 5981 int ret;
5987 5982
5988 efd = simple_strtoul(buffer, &endp, 10); 5983 efd = simple_strtoul(buffer, &endp, 10);
5989 if (*endp != ' ') 5984 if (*endp != ' ')
5990 return -EINVAL; 5985 return -EINVAL;
5991 buffer = endp + 1; 5986 buffer = endp + 1;
5992 5987
5993 cfd = simple_strtoul(buffer, &endp, 10); 5988 cfd = simple_strtoul(buffer, &endp, 10);
5994 if ((*endp != ' ') && (*endp != '\0')) 5989 if ((*endp != ' ') && (*endp != '\0'))
5995 return -EINVAL; 5990 return -EINVAL;
5996 buffer = endp + 1; 5991 buffer = endp + 1;
5997 5992
5998 event = kzalloc(sizeof(*event), GFP_KERNEL); 5993 event = kzalloc(sizeof(*event), GFP_KERNEL);
5999 if (!event) 5994 if (!event)
6000 return -ENOMEM; 5995 return -ENOMEM;
6001 5996
6002 event->memcg = memcg; 5997 event->memcg = memcg;
6003 INIT_LIST_HEAD(&event->list); 5998 INIT_LIST_HEAD(&event->list);
6004 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 5999 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6005 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 6000 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6006 INIT_WORK(&event->remove, memcg_event_remove); 6001 INIT_WORK(&event->remove, memcg_event_remove);
6007 6002
6008 efile = fdget(efd); 6003 efile = fdget(efd);
6009 if (!efile.file) { 6004 if (!efile.file) {
6010 ret = -EBADF; 6005 ret = -EBADF;
6011 goto out_kfree; 6006 goto out_kfree;
6012 } 6007 }
6013 6008
6014 event->eventfd = eventfd_ctx_fileget(efile.file); 6009 event->eventfd = eventfd_ctx_fileget(efile.file);
6015 if (IS_ERR(event->eventfd)) { 6010 if (IS_ERR(event->eventfd)) {
6016 ret = PTR_ERR(event->eventfd); 6011 ret = PTR_ERR(event->eventfd);
6017 goto out_put_efile; 6012 goto out_put_efile;
6018 } 6013 }
6019 6014
6020 cfile = fdget(cfd); 6015 cfile = fdget(cfd);
6021 if (!cfile.file) { 6016 if (!cfile.file) {
6022 ret = -EBADF; 6017 ret = -EBADF;
6023 goto out_put_eventfd; 6018 goto out_put_eventfd;
6024 } 6019 }
6025 6020
6026 /* the process needs read permission on the control file */ 6021 /* the process needs read permission on the control file */
6027 /* AV: shouldn't we check that it's been opened for read instead? */ 6022 /* AV: shouldn't we check that it's been opened for read instead? */
6028 ret = inode_permission(file_inode(cfile.file), MAY_READ); 6023 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6029 if (ret < 0) 6024 if (ret < 0)
6030 goto out_put_cfile; 6025 goto out_put_cfile;
6031 6026
6032 /* 6027 /*
6033 * Determine the event callbacks and set them in @event. This used 6028 * Determine the event callbacks and set them in @event. This used
6034 * to be done via struct cftype but cgroup core no longer knows 6029 * to be done via struct cftype but cgroup core no longer knows
6035 * about these events. The following is crude but the whole thing 6030 * about these events. The following is crude but the whole thing
6036 * is for compatibility anyway. 6031 * is for compatibility anyway.
6037 * 6032 *
6038 * DO NOT ADD NEW FILES. 6033 * DO NOT ADD NEW FILES.
6039 */ 6034 */
6040 name = cfile.file->f_dentry->d_name.name; 6035 name = cfile.file->f_dentry->d_name.name;
6041 6036
6042 if (!strcmp(name, "memory.usage_in_bytes")) { 6037 if (!strcmp(name, "memory.usage_in_bytes")) {
6043 event->register_event = mem_cgroup_usage_register_event; 6038 event->register_event = mem_cgroup_usage_register_event;
6044 event->unregister_event = mem_cgroup_usage_unregister_event; 6039 event->unregister_event = mem_cgroup_usage_unregister_event;
6045 } else if (!strcmp(name, "memory.oom_control")) { 6040 } else if (!strcmp(name, "memory.oom_control")) {
6046 event->register_event = mem_cgroup_oom_register_event; 6041 event->register_event = mem_cgroup_oom_register_event;
6047 event->unregister_event = mem_cgroup_oom_unregister_event; 6042 event->unregister_event = mem_cgroup_oom_unregister_event;
6048 } else if (!strcmp(name, "memory.pressure_level")) { 6043 } else if (!strcmp(name, "memory.pressure_level")) {
6049 event->register_event = vmpressure_register_event; 6044 event->register_event = vmpressure_register_event;
6050 event->unregister_event = vmpressure_unregister_event; 6045 event->unregister_event = vmpressure_unregister_event;
6051 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 6046 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6052 event->register_event = memsw_cgroup_usage_register_event; 6047 event->register_event = memsw_cgroup_usage_register_event;
6053 event->unregister_event = memsw_cgroup_usage_unregister_event; 6048 event->unregister_event = memsw_cgroup_usage_unregister_event;
6054 } else { 6049 } else {
6055 ret = -EINVAL; 6050 ret = -EINVAL;
6056 goto out_put_cfile; 6051 goto out_put_cfile;
6057 } 6052 }
6058 6053
6059 /* 6054 /*
6060 * Verify @cfile should belong to @css. Also, remaining events are 6055 * Verify @cfile should belong to @css. Also, remaining events are
6061 * automatically removed on cgroup destruction but the removal is 6056 * automatically removed on cgroup destruction but the removal is
6062 * asynchronous, so take an extra ref on @css. 6057 * asynchronous, so take an extra ref on @css.
6063 */ 6058 */
6064 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, 6059 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
6065 &memory_cgrp_subsys); 6060 &memory_cgrp_subsys);
6066 ret = -EINVAL; 6061 ret = -EINVAL;
6067 if (IS_ERR(cfile_css)) 6062 if (IS_ERR(cfile_css))
6068 goto out_put_cfile; 6063 goto out_put_cfile;
6069 if (cfile_css != css) { 6064 if (cfile_css != css) {
6070 css_put(cfile_css); 6065 css_put(cfile_css);
6071 goto out_put_cfile; 6066 goto out_put_cfile;
6072 } 6067 }
6073 6068
6074 ret = event->register_event(memcg, event->eventfd, buffer); 6069 ret = event->register_event(memcg, event->eventfd, buffer);
6075 if (ret) 6070 if (ret)
6076 goto out_put_css; 6071 goto out_put_css;
6077 6072
6078 efile.file->f_op->poll(efile.file, &event->pt); 6073 efile.file->f_op->poll(efile.file, &event->pt);
6079 6074
6080 spin_lock(&memcg->event_list_lock); 6075 spin_lock(&memcg->event_list_lock);
6081 list_add(&event->list, &memcg->event_list); 6076 list_add(&event->list, &memcg->event_list);
6082 spin_unlock(&memcg->event_list_lock); 6077 spin_unlock(&memcg->event_list_lock);
6083 6078
6084 fdput(cfile); 6079 fdput(cfile);
6085 fdput(efile); 6080 fdput(efile);
6086 6081
6087 return 0; 6082 return 0;
6088 6083
6089 out_put_css: 6084 out_put_css:
6090 css_put(css); 6085 css_put(css);
6091 out_put_cfile: 6086 out_put_cfile:
6092 fdput(cfile); 6087 fdput(cfile);
6093 out_put_eventfd: 6088 out_put_eventfd:
6094 eventfd_ctx_put(event->eventfd); 6089 eventfd_ctx_put(event->eventfd);
6095 out_put_efile: 6090 out_put_efile:
6096 fdput(efile); 6091 fdput(efile);
6097 out_kfree: 6092 out_kfree:
6098 kfree(event); 6093 kfree(event);
6099 6094
6100 return ret; 6095 return ret;
6101 } 6096 }
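
The handler above is driven entirely from userspace: a process creates an eventfd, opens the control file it wants notifications for, and writes "<event_fd> <control_fd> <args>" into cgroup.event_control. Below is a minimal userspace sketch for memory.oom_control, which takes no extra args (the cgroup v1 mount point and group name are assumptions; memory.usage_in_bytes would additionally need a threshold argument):

	#include <sys/eventfd.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *cg = "/sys/fs/cgroup/memory/mygroup";  /* assumed v1 mount */
		char path[256], msg[64];
		uint64_t count;
		int efd, cfd, ecfd, len;

		efd = eventfd(0, 0);				/* <event_fd> */

		snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
		cfd = open(path, O_RDONLY);			/* <control_fd> */

		snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
		ecfd = open(path, O_WRONLY);

		if (efd < 0 || cfd < 0 || ecfd < 0)
			return 1;

		/* exactly the "<event_fd> <control_fd>" string parsed above */
		len = snprintf(msg, sizeof(msg), "%d %d", efd, cfd);
		if (write(ecfd, msg, len) != len)
			return 1;

		/* blocks until the kernel signals an OOM event for the group */
		if (read(efd, &count, sizeof(count)) == sizeof(count))
			printf("oom notifications: %llu\n",
			       (unsigned long long)count);
		return 0;
	}

Note that the kernel side demands only read permission on the control file plus the requirement that it live in the same cgroup directory as cgroup.event_control, which is what the css_tryget_from_dir() comparison above enforces.
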
6102 6097
6103 static struct cftype mem_cgroup_files[] = { 6098 static struct cftype mem_cgroup_files[] = {
6104 { 6099 {
6105 .name = "usage_in_bytes", 6100 .name = "usage_in_bytes",
6106 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6101 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
6107 .read_u64 = mem_cgroup_read_u64, 6102 .read_u64 = mem_cgroup_read_u64,
6108 }, 6103 },
6109 { 6104 {
6110 .name = "max_usage_in_bytes", 6105 .name = "max_usage_in_bytes",
6111 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6106 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6112 .trigger = mem_cgroup_reset, 6107 .trigger = mem_cgroup_reset,
6113 .read_u64 = mem_cgroup_read_u64, 6108 .read_u64 = mem_cgroup_read_u64,
6114 }, 6109 },
6115 { 6110 {
6116 .name = "limit_in_bytes", 6111 .name = "limit_in_bytes",
6117 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6112 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
6118 .write_string = mem_cgroup_write, 6113 .write_string = mem_cgroup_write,
6119 .read_u64 = mem_cgroup_read_u64, 6114 .read_u64 = mem_cgroup_read_u64,
6120 }, 6115 },
6121 { 6116 {
6122 .name = "soft_limit_in_bytes", 6117 .name = "soft_limit_in_bytes",
6123 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6118 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
6124 .write_string = mem_cgroup_write, 6119 .write_string = mem_cgroup_write,
6125 .read_u64 = mem_cgroup_read_u64, 6120 .read_u64 = mem_cgroup_read_u64,
6126 }, 6121 },
6127 { 6122 {
6128 .name = "failcnt", 6123 .name = "failcnt",
6129 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6124 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6130 .trigger = mem_cgroup_reset, 6125 .trigger = mem_cgroup_reset,
6131 .read_u64 = mem_cgroup_read_u64, 6126 .read_u64 = mem_cgroup_read_u64,
6132 }, 6127 },
6133 { 6128 {
6134 .name = "stat", 6129 .name = "stat",
6135 .seq_show = memcg_stat_show, 6130 .seq_show = memcg_stat_show,
6136 }, 6131 },
6137 { 6132 {
6138 .name = "force_empty", 6133 .name = "force_empty",
6139 .trigger = mem_cgroup_force_empty_write, 6134 .trigger = mem_cgroup_force_empty_write,
6140 }, 6135 },
6141 { 6136 {
6142 .name = "use_hierarchy", 6137 .name = "use_hierarchy",
6143 .flags = CFTYPE_INSANE, 6138 .flags = CFTYPE_INSANE,
6144 .write_u64 = mem_cgroup_hierarchy_write, 6139 .write_u64 = mem_cgroup_hierarchy_write,
6145 .read_u64 = mem_cgroup_hierarchy_read, 6140 .read_u64 = mem_cgroup_hierarchy_read,
6146 }, 6141 },
6147 { 6142 {
6148 .name = "cgroup.event_control", /* XXX: for compat */ 6143 .name = "cgroup.event_control", /* XXX: for compat */
6149 .write_string = memcg_write_event_control, 6144 .write_string = memcg_write_event_control,
6150 .flags = CFTYPE_NO_PREFIX, 6145 .flags = CFTYPE_NO_PREFIX,
6151 .mode = S_IWUGO, 6146 .mode = S_IWUGO,
6152 }, 6147 },
6153 { 6148 {
6154 .name = "swappiness", 6149 .name = "swappiness",
6155 .read_u64 = mem_cgroup_swappiness_read, 6150 .read_u64 = mem_cgroup_swappiness_read,
6156 .write_u64 = mem_cgroup_swappiness_write, 6151 .write_u64 = mem_cgroup_swappiness_write,
6157 }, 6152 },
6158 { 6153 {
6159 .name = "move_charge_at_immigrate", 6154 .name = "move_charge_at_immigrate",
6160 .read_u64 = mem_cgroup_move_charge_read, 6155 .read_u64 = mem_cgroup_move_charge_read,
6161 .write_u64 = mem_cgroup_move_charge_write, 6156 .write_u64 = mem_cgroup_move_charge_write,
6162 }, 6157 },
6163 { 6158 {
6164 .name = "oom_control", 6159 .name = "oom_control",
6165 .seq_show = mem_cgroup_oom_control_read, 6160 .seq_show = mem_cgroup_oom_control_read,
6166 .write_u64 = mem_cgroup_oom_control_write, 6161 .write_u64 = mem_cgroup_oom_control_write,
6167 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6162 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6168 }, 6163 },
6169 { 6164 {
6170 .name = "pressure_level", 6165 .name = "pressure_level",
6171 }, 6166 },
6172 #ifdef CONFIG_NUMA 6167 #ifdef CONFIG_NUMA
6173 { 6168 {
6174 .name = "numa_stat", 6169 .name = "numa_stat",
6175 .seq_show = memcg_numa_stat_show, 6170 .seq_show = memcg_numa_stat_show,
6176 }, 6171 },
6177 #endif 6172 #endif
6178 #ifdef CONFIG_MEMCG_KMEM 6173 #ifdef CONFIG_MEMCG_KMEM
6179 { 6174 {
6180 .name = "kmem.limit_in_bytes", 6175 .name = "kmem.limit_in_bytes",
6181 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6176 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6182 .write_string = mem_cgroup_write, 6177 .write_string = mem_cgroup_write,
6183 .read_u64 = mem_cgroup_read_u64, 6178 .read_u64 = mem_cgroup_read_u64,
6184 }, 6179 },
6185 { 6180 {
6186 .name = "kmem.usage_in_bytes", 6181 .name = "kmem.usage_in_bytes",
6187 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6182 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6188 .read_u64 = mem_cgroup_read_u64, 6183 .read_u64 = mem_cgroup_read_u64,
6189 }, 6184 },
6190 { 6185 {
6191 .name = "kmem.failcnt", 6186 .name = "kmem.failcnt",
6192 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6187 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6193 .trigger = mem_cgroup_reset, 6188 .trigger = mem_cgroup_reset,
6194 .read_u64 = mem_cgroup_read_u64, 6189 .read_u64 = mem_cgroup_read_u64,
6195 }, 6190 },
6196 { 6191 {
6197 .name = "kmem.max_usage_in_bytes", 6192 .name = "kmem.max_usage_in_bytes",
6198 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6193 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6199 .trigger = mem_cgroup_reset, 6194 .trigger = mem_cgroup_reset,
6200 .read_u64 = mem_cgroup_read_u64, 6195 .read_u64 = mem_cgroup_read_u64,
6201 }, 6196 },
6202 #ifdef CONFIG_SLABINFO 6197 #ifdef CONFIG_SLABINFO
6203 { 6198 {
6204 .name = "kmem.slabinfo", 6199 .name = "kmem.slabinfo",
6205 .seq_show = mem_cgroup_slabinfo_read, 6200 .seq_show = mem_cgroup_slabinfo_read,
6206 }, 6201 },
6207 #endif 6202 #endif
6208 #endif 6203 #endif
6209 { }, /* terminate */ 6204 { }, /* terminate */
6210 }; 6205 };
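
The .private values in this table pack a counter type (_MEM, _MEMSWAP, _KMEM) and a member (RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_SOFT_LIMIT, RES_FAILCNT) into one integer so that a single read/write/trigger handler can serve all of these files. A hedged sketch of the encoding, assuming the usual 16-bit split behind MEMFILE_PRIVATE (the real macros are defined earlier in memcontrol.c, outside this hunk):

	/* hedged sketch -- not copied from the file, just the assumed shape */
	#define MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
	#define MEMFILE_TYPE(val)		(((val) >> 16) & 0xffff)
	#define MEMFILE_ATTR(val)		((val) & 0xffff)

	/*
	 * A handler such as mem_cgroup_read_u64() can then recover both halves:
	 *	type = MEMFILE_TYPE(cft->private);	// which res_counter
	 *	attr = MEMFILE_ATTR(cft->private);	// which member of it
	 */
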
6211 6206
6212 #ifdef CONFIG_MEMCG_SWAP 6207 #ifdef CONFIG_MEMCG_SWAP
6213 static struct cftype memsw_cgroup_files[] = { 6208 static struct cftype memsw_cgroup_files[] = {
6214 { 6209 {
6215 .name = "memsw.usage_in_bytes", 6210 .name = "memsw.usage_in_bytes",
6216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6211 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6217 .read_u64 = mem_cgroup_read_u64, 6212 .read_u64 = mem_cgroup_read_u64,
6218 }, 6213 },
6219 { 6214 {
6220 .name = "memsw.max_usage_in_bytes", 6215 .name = "memsw.max_usage_in_bytes",
6221 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6222 .trigger = mem_cgroup_reset, 6217 .trigger = mem_cgroup_reset,
6223 .read_u64 = mem_cgroup_read_u64, 6218 .read_u64 = mem_cgroup_read_u64,
6224 }, 6219 },
6225 { 6220 {
6226 .name = "memsw.limit_in_bytes", 6221 .name = "memsw.limit_in_bytes",
6227 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6222 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6228 .write_string = mem_cgroup_write, 6223 .write_string = mem_cgroup_write,
6229 .read_u64 = mem_cgroup_read_u64, 6224 .read_u64 = mem_cgroup_read_u64,
6230 }, 6225 },
6231 { 6226 {
6232 .name = "memsw.failcnt", 6227 .name = "memsw.failcnt",
6233 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6228 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6234 .trigger = mem_cgroup_reset, 6229 .trigger = mem_cgroup_reset,
6235 .read_u64 = mem_cgroup_read_u64, 6230 .read_u64 = mem_cgroup_read_u64,
6236 }, 6231 },
6237 { }, /* terminate */ 6232 { }, /* terminate */
6238 }; 6233 };
6239 #endif 6234 #endif
6240 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6235 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6241 { 6236 {
6242 struct mem_cgroup_per_node *pn; 6237 struct mem_cgroup_per_node *pn;
6243 struct mem_cgroup_per_zone *mz; 6238 struct mem_cgroup_per_zone *mz;
6244 int zone, tmp = node; 6239 int zone, tmp = node;
6245 /* 6240 /*
6246 * This routine is called against possible nodes. 6241 * This routine is called against possible nodes.
6247 	 * But it's a BUG to call kmalloc() against an offline node. 6242 	 * But it's a BUG to call kmalloc() against an offline node.
6248 * 6243 *
6249 * TODO: this routine can waste much memory for nodes which will 6244 * TODO: this routine can waste much memory for nodes which will
6250 * never be onlined. It's better to use memory hotplug callback 6245 * never be onlined. It's better to use memory hotplug callback
6251 * function. 6246 * function.
6252 */ 6247 */
6253 if (!node_state(node, N_NORMAL_MEMORY)) 6248 if (!node_state(node, N_NORMAL_MEMORY))
6254 tmp = -1; 6249 tmp = -1;
6255 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6250 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6256 if (!pn) 6251 if (!pn)
6257 return 1; 6252 return 1;
6258 6253
6259 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6254 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6260 mz = &pn->zoneinfo[zone]; 6255 mz = &pn->zoneinfo[zone];
6261 lruvec_init(&mz->lruvec); 6256 lruvec_init(&mz->lruvec);
6262 mz->usage_in_excess = 0; 6257 mz->usage_in_excess = 0;
6263 mz->on_tree = false; 6258 mz->on_tree = false;
6264 mz->memcg = memcg; 6259 mz->memcg = memcg;
6265 } 6260 }
6266 memcg->nodeinfo[node] = pn; 6261 memcg->nodeinfo[node] = pn;
6267 return 0; 6262 return 0;
6268 } 6263 }
6269 6264
6270 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6265 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6271 { 6266 {
6272 kfree(memcg->nodeinfo[node]); 6267 kfree(memcg->nodeinfo[node]);
6273 } 6268 }
6274 6269
6275 static struct mem_cgroup *mem_cgroup_alloc(void) 6270 static struct mem_cgroup *mem_cgroup_alloc(void)
6276 { 6271 {
6277 struct mem_cgroup *memcg; 6272 struct mem_cgroup *memcg;
6278 size_t size; 6273 size_t size;
6279 6274
6280 size = sizeof(struct mem_cgroup); 6275 size = sizeof(struct mem_cgroup);
6281 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 6276 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6282 6277
6283 memcg = kzalloc(size, GFP_KERNEL); 6278 memcg = kzalloc(size, GFP_KERNEL);
6284 if (!memcg) 6279 if (!memcg)
6285 return NULL; 6280 return NULL;
6286 6281
6287 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6282 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6288 if (!memcg->stat) 6283 if (!memcg->stat)
6289 goto out_free; 6284 goto out_free;
6290 spin_lock_init(&memcg->pcp_counter_lock); 6285 spin_lock_init(&memcg->pcp_counter_lock);
6291 return memcg; 6286 return memcg;
6292 6287
6293 out_free: 6288 out_free:
6294 kfree(memcg); 6289 kfree(memcg);
6295 return NULL; 6290 return NULL;
6296 } 6291 }
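
The size computed in mem_cgroup_alloc() is sizeof(struct mem_cgroup) plus one pointer per possible node, which implies the structure ends in a per-node pointer array sized at run time; a single kzalloc() therefore covers the fixed part and all nr_node_ids slots at once. An abridged, hedged sketch of that tail layout (the real definition sits earlier in this file):

	/* abridged sketch -- only fields referenced in this hunk are shown */
	struct mem_cgroup {
		struct cgroup_subsys_state css;
		struct res_counter res;		/* memory */
		struct res_counter memsw;	/* memory + swap */
		struct res_counter kmem;	/* kernel memory */
		struct mem_cgroup_stat_cpu __percpu *stat;
		/* ... many more fields ... */
		struct mem_cgroup_per_node *nodeinfo[0];	/* nr_node_ids slots */
	};

alloc_mem_cgroup_per_zone_info() then populates nodeinfo[] one node at a time, falling back to node -1 (any node) when the target node has no normal memory.
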
6297 6292
6298 /* 6293 /*
6299 * At destroying mem_cgroup, references from swap_cgroup can remain. 6294 * At destroying mem_cgroup, references from swap_cgroup can remain.
6300 * (scanning all at force_empty is too costly...) 6295 * (scanning all at force_empty is too costly...)
6301 * 6296 *
6302 * Instead of clearing all references at force_empty, we remember 6297 * Instead of clearing all references at force_empty, we remember
6303 * the number of reference from swap_cgroup and free mem_cgroup when 6298 * the number of reference from swap_cgroup and free mem_cgroup when
6304 * it goes down to 0. 6299 * it goes down to 0.
6305 * 6300 *
6306 * Removal of cgroup itself succeeds regardless of refs from swap. 6301 * Removal of cgroup itself succeeds regardless of refs from swap.
6307 */ 6302 */
6308 6303
6309 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6304 static void __mem_cgroup_free(struct mem_cgroup *memcg)
6310 { 6305 {
6311 int node; 6306 int node;
6312 6307
6313 mem_cgroup_remove_from_trees(memcg); 6308 mem_cgroup_remove_from_trees(memcg);
6314 6309
6315 for_each_node(node) 6310 for_each_node(node)
6316 free_mem_cgroup_per_zone_info(memcg, node); 6311 free_mem_cgroup_per_zone_info(memcg, node);
6317 6312
6318 free_percpu(memcg->stat); 6313 free_percpu(memcg->stat);
6319 6314
6320 /* 6315 /*
6321 * We need to make sure that (at least for now), the jump label 6316 * We need to make sure that (at least for now), the jump label
6322 * destruction code runs outside of the cgroup lock. This is because 6317 * destruction code runs outside of the cgroup lock. This is because
6323 * get_online_cpus(), which is called from the static_branch update, 6318 * get_online_cpus(), which is called from the static_branch update,
6324 * can't be called inside the cgroup_lock. cpusets are the ones 6319 * can't be called inside the cgroup_lock. cpusets are the ones
6325 * enforcing this dependency, so if they ever change, we might as well. 6320 * enforcing this dependency, so if they ever change, we might as well.
6326 * 6321 *
6327 * schedule_work() will guarantee this happens. Be careful if you need 6322 * schedule_work() will guarantee this happens. Be careful if you need
6328 * to move this code around, and make sure it is outside 6323 * to move this code around, and make sure it is outside
6329 * the cgroup_lock. 6324 * the cgroup_lock.
6330 */ 6325 */
6331 disarm_static_keys(memcg); 6326 disarm_static_keys(memcg);
6332 kfree(memcg); 6327 kfree(memcg);
6333 } 6328 }
6334 6329
6335 /* 6330 /*
6336 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 6331 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
6337 */ 6332 */
6338 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 6333 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6339 { 6334 {
6340 if (!memcg->res.parent) 6335 if (!memcg->res.parent)
6341 return NULL; 6336 return NULL;
6342 return mem_cgroup_from_res_counter(memcg->res.parent, res); 6337 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6343 } 6338 }
6344 EXPORT_SYMBOL(parent_mem_cgroup); 6339 EXPORT_SYMBOL(parent_mem_cgroup);
6345 6340
6346 static void __init mem_cgroup_soft_limit_tree_init(void) 6341 static void __init mem_cgroup_soft_limit_tree_init(void)
6347 { 6342 {
6348 struct mem_cgroup_tree_per_node *rtpn; 6343 struct mem_cgroup_tree_per_node *rtpn;
6349 struct mem_cgroup_tree_per_zone *rtpz; 6344 struct mem_cgroup_tree_per_zone *rtpz;
6350 int tmp, node, zone; 6345 int tmp, node, zone;
6351 6346
6352 for_each_node(node) { 6347 for_each_node(node) {
6353 tmp = node; 6348 tmp = node;
6354 if (!node_state(node, N_NORMAL_MEMORY)) 6349 if (!node_state(node, N_NORMAL_MEMORY))
6355 tmp = -1; 6350 tmp = -1;
6356 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6351 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6357 BUG_ON(!rtpn); 6352 BUG_ON(!rtpn);
6358 6353
6359 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6354 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6360 6355
6361 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6356 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6362 rtpz = &rtpn->rb_tree_per_zone[zone]; 6357 rtpz = &rtpn->rb_tree_per_zone[zone];
6363 rtpz->rb_root = RB_ROOT; 6358 rtpz->rb_root = RB_ROOT;
6364 spin_lock_init(&rtpz->lock); 6359 spin_lock_init(&rtpz->lock);
6365 } 6360 }
6366 } 6361 }
6367 } 6362 }
6368 6363
6369 static struct cgroup_subsys_state * __ref 6364 static struct cgroup_subsys_state * __ref
6370 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6365 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6371 { 6366 {
6372 struct mem_cgroup *memcg; 6367 struct mem_cgroup *memcg;
6373 long error = -ENOMEM; 6368 long error = -ENOMEM;
6374 int node; 6369 int node;
6375 6370
6376 memcg = mem_cgroup_alloc(); 6371 memcg = mem_cgroup_alloc();
6377 if (!memcg) 6372 if (!memcg)
6378 return ERR_PTR(error); 6373 return ERR_PTR(error);
6379 6374
6380 for_each_node(node) 6375 for_each_node(node)
6381 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 6376 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6382 goto free_out; 6377 goto free_out;
6383 6378
6384 /* root ? */ 6379 /* root ? */
6385 if (parent_css == NULL) { 6380 if (parent_css == NULL) {
6386 root_mem_cgroup = memcg; 6381 root_mem_cgroup = memcg;
6387 res_counter_init(&memcg->res, NULL); 6382 res_counter_init(&memcg->res, NULL);
6388 res_counter_init(&memcg->memsw, NULL); 6383 res_counter_init(&memcg->memsw, NULL);
6389 res_counter_init(&memcg->kmem, NULL); 6384 res_counter_init(&memcg->kmem, NULL);
6390 } 6385 }
6391 6386
6392 memcg->last_scanned_node = MAX_NUMNODES; 6387 memcg->last_scanned_node = MAX_NUMNODES;
6393 INIT_LIST_HEAD(&memcg->oom_notify); 6388 INIT_LIST_HEAD(&memcg->oom_notify);
6394 memcg->move_charge_at_immigrate = 0; 6389 memcg->move_charge_at_immigrate = 0;
6395 mutex_init(&memcg->thresholds_lock); 6390 mutex_init(&memcg->thresholds_lock);
6396 spin_lock_init(&memcg->move_lock); 6391 spin_lock_init(&memcg->move_lock);
6397 vmpressure_init(&memcg->vmpressure); 6392 vmpressure_init(&memcg->vmpressure);
6398 INIT_LIST_HEAD(&memcg->event_list); 6393 INIT_LIST_HEAD(&memcg->event_list);
6399 spin_lock_init(&memcg->event_list_lock); 6394 spin_lock_init(&memcg->event_list_lock);
6400 6395
6401 return &memcg->css; 6396 return &memcg->css;
6402 6397
6403 free_out: 6398 free_out:
6404 __mem_cgroup_free(memcg); 6399 __mem_cgroup_free(memcg);
6405 return ERR_PTR(error); 6400 return ERR_PTR(error);
6406 } 6401 }
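
mem_cgroup_css_alloc() is the first of the css lifecycle callbacks (alloc, online, offline, free) that make up the rest of this hunk; the cgroup core invokes them through the subsystem descriptor defined further down in the file. A hedged sketch of that wiring (the exact initializer is not part of this diff):

	/* hedged sketch of the subsystem hookup; see the real definition
	 * of memory_cgrp_subsys near the end of memcontrol.c */
	struct cgroup_subsys memory_cgrp_subsys = {
		.css_alloc	= mem_cgroup_css_alloc,
		.css_online	= mem_cgroup_css_online,
		.css_offline	= mem_cgroup_css_offline,
		.css_free	= mem_cgroup_css_free,
		.can_attach	= mem_cgroup_can_attach,
		.cancel_attach	= mem_cgroup_cancel_attach,
		.base_cftypes	= mem_cgroup_files,
		/* ... */
	};
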
6407 6402
6408 static int 6403 static int
6409 mem_cgroup_css_online(struct cgroup_subsys_state *css) 6404 mem_cgroup_css_online(struct cgroup_subsys_state *css)
6410 { 6405 {
6411 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6406 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6412 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6407 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6413 6408
6414 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6409 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6415 return -ENOSPC; 6410 return -ENOSPC;
6416 6411
6417 if (!parent) 6412 if (!parent)
6418 return 0; 6413 return 0;
6419 6414
6420 mutex_lock(&memcg_create_mutex); 6415 mutex_lock(&memcg_create_mutex);
6421 6416
6422 memcg->use_hierarchy = parent->use_hierarchy; 6417 memcg->use_hierarchy = parent->use_hierarchy;
6423 memcg->oom_kill_disable = parent->oom_kill_disable; 6418 memcg->oom_kill_disable = parent->oom_kill_disable;
6424 memcg->swappiness = mem_cgroup_swappiness(parent); 6419 memcg->swappiness = mem_cgroup_swappiness(parent);
6425 6420
6426 if (parent->use_hierarchy) { 6421 if (parent->use_hierarchy) {
6427 res_counter_init(&memcg->res, &parent->res); 6422 res_counter_init(&memcg->res, &parent->res);
6428 res_counter_init(&memcg->memsw, &parent->memsw); 6423 res_counter_init(&memcg->memsw, &parent->memsw);
6429 res_counter_init(&memcg->kmem, &parent->kmem); 6424 res_counter_init(&memcg->kmem, &parent->kmem);
6430 6425
6431 /* 6426 /*
6432 * No need to take a reference to the parent because cgroup 6427 * No need to take a reference to the parent because cgroup
6433 * core guarantees its existence. 6428 * core guarantees its existence.
6434 */ 6429 */
6435 } else { 6430 } else {
6436 res_counter_init(&memcg->res, NULL); 6431 res_counter_init(&memcg->res, NULL);
6437 res_counter_init(&memcg->memsw, NULL); 6432 res_counter_init(&memcg->memsw, NULL);
6438 res_counter_init(&memcg->kmem, NULL); 6433 res_counter_init(&memcg->kmem, NULL);
6439 /* 6434 /*
6440 		 * Deeper hierarchy with use_hierarchy == false doesn't make 6435 		 * Deeper hierarchy with use_hierarchy == false doesn't make
6441 * much sense so let cgroup subsystem know about this 6436 * much sense so let cgroup subsystem know about this
6442 * unfortunate state in our controller. 6437 * unfortunate state in our controller.
6443 */ 6438 */
6444 if (parent != root_mem_cgroup) 6439 if (parent != root_mem_cgroup)
6445 memory_cgrp_subsys.broken_hierarchy = true; 6440 memory_cgrp_subsys.broken_hierarchy = true;
6446 } 6441 }
6447 mutex_unlock(&memcg_create_mutex); 6442 mutex_unlock(&memcg_create_mutex);
6448 6443
6449 return memcg_init_kmem(memcg, &memory_cgrp_subsys); 6444 return memcg_init_kmem(memcg, &memory_cgrp_subsys);
6450 } 6445 }
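
When use_hierarchy is inherited as true, the res_counter_init(..., &parent->res) calls above chain each child counter to its parent, so a charge must fit under every ancestor's limit before it succeeds and is unwound from all of them on failure. A simplified, hedged sketch of that walk (the names below are made up; the real code, with proper locking, lives in kernel/res_counter.c):

	/* illustrative only -- not the kernel's res_counter implementation */
	struct counter {
		unsigned long long usage, limit;
		struct counter *parent;		/* NULL for an unparented root */
	};

	static int charge_up_the_chain(struct counter *counter, unsigned long val)
	{
		struct counter *c, *u;

		for (c = counter; c != NULL; c = c->parent) {
			if (c->usage + val > c->limit) {
				/* undo the levels already charged */
				for (u = counter; u != c; u = u->parent)
					u->usage -= val;
				return -1;
			}
			c->usage += val;
		}
		return 0;
	}

This is also why the else branch passes NULL parents: a non-hierarchical group accounts only for itself, and nesting such a group below a non-root parent is exactly what gets flagged as a broken hierarchy.
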
6451 6446
6452 /* 6447 /*
6453 * Announce all parents that a group from their hierarchy is gone. 6448 * Announce all parents that a group from their hierarchy is gone.
6454 */ 6449 */
6455 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 6450 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6456 { 6451 {
6457 struct mem_cgroup *parent = memcg; 6452 struct mem_cgroup *parent = memcg;
6458 6453
6459 while ((parent = parent_mem_cgroup(parent))) 6454 while ((parent = parent_mem_cgroup(parent)))
6460 mem_cgroup_iter_invalidate(parent); 6455 mem_cgroup_iter_invalidate(parent);
6461 6456
6462 /* 6457 /*
6463 * if the root memcg is not hierarchical we have to check it 6458 * if the root memcg is not hierarchical we have to check it
6464 	 * explicitly. 6459 	 * explicitly.
6465 */ 6460 */
6466 if (!root_mem_cgroup->use_hierarchy) 6461 if (!root_mem_cgroup->use_hierarchy)
6467 mem_cgroup_iter_invalidate(root_mem_cgroup); 6462 mem_cgroup_iter_invalidate(root_mem_cgroup);
6468 } 6463 }
6469 6464
6470 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6465 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6471 { 6466 {
6472 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6467 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6473 struct mem_cgroup_event *event, *tmp; 6468 struct mem_cgroup_event *event, *tmp;
6474 struct cgroup_subsys_state *iter; 6469 struct cgroup_subsys_state *iter;
6475 6470
6476 /* 6471 /*
6477 * Unregister events and notify userspace. 6472 * Unregister events and notify userspace.
6478 * Notify userspace about cgroup removing only after rmdir of cgroup 6473 * Notify userspace about cgroup removing only after rmdir of cgroup
6479 * directory to avoid race between userspace and kernelspace. 6474 * directory to avoid race between userspace and kernelspace.
6480 */ 6475 */
6481 spin_lock(&memcg->event_list_lock); 6476 spin_lock(&memcg->event_list_lock);
6482 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 6477 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6483 list_del_init(&event->list); 6478 list_del_init(&event->list);
6484 schedule_work(&event->remove); 6479 schedule_work(&event->remove);
6485 } 6480 }
6486 spin_unlock(&memcg->event_list_lock); 6481 spin_unlock(&memcg->event_list_lock);
6487 6482
6488 kmem_cgroup_css_offline(memcg); 6483 kmem_cgroup_css_offline(memcg);
6489 6484
6490 mem_cgroup_invalidate_reclaim_iterators(memcg); 6485 mem_cgroup_invalidate_reclaim_iterators(memcg);
6491 6486
6492 /* 6487 /*
6493 * This requires that offlining is serialized. Right now that is 6488 * This requires that offlining is serialized. Right now that is
6494 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 6489 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6495 */ 6490 */
6496 css_for_each_descendant_post(iter, css) 6491 css_for_each_descendant_post(iter, css)
6497 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 6492 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6498 6493
6499 mem_cgroup_destroy_all_caches(memcg); 6494 mem_cgroup_destroy_all_caches(memcg);
6500 vmpressure_cleanup(&memcg->vmpressure); 6495 vmpressure_cleanup(&memcg->vmpressure);
6501 } 6496 }
6502 6497
6503 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 6498 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6504 { 6499 {
6505 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6500 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6506 /* 6501 /*
6507 * XXX: css_offline() would be where we should reparent all 6502 * XXX: css_offline() would be where we should reparent all
6508 * memory to prepare the cgroup for destruction. However, 6503 * memory to prepare the cgroup for destruction. However,
6509 * memcg does not do css_tryget() and res_counter charging 6504 * memcg does not do css_tryget() and res_counter charging
6510 * under the same RCU lock region, which means that charging 6505 * under the same RCU lock region, which means that charging
6511 * could race with offlining. Offlining only happens to 6506 * could race with offlining. Offlining only happens to
6512 * cgroups with no tasks in them but charges can show up 6507 * cgroups with no tasks in them but charges can show up
6513 * without any tasks from the swapin path when the target 6508 * without any tasks from the swapin path when the target
6514 * memcg is looked up from the swapout record and not from the 6509 * memcg is looked up from the swapout record and not from the
6515 * current task as it usually is. A race like this can leak 6510 * current task as it usually is. A race like this can leak
6516 * charges and put pages with stale cgroup pointers into 6511 * charges and put pages with stale cgroup pointers into
6517 * circulation: 6512 * circulation:
6518 * 6513 *
6519 * #0 #1 6514 * #0 #1
6520 * lookup_swap_cgroup_id() 6515 * lookup_swap_cgroup_id()
6521 * rcu_read_lock() 6516 * rcu_read_lock()
6522 * mem_cgroup_lookup() 6517 * mem_cgroup_lookup()
6523 * css_tryget() 6518 * css_tryget()
6524 * rcu_read_unlock() 6519 * rcu_read_unlock()
6525 * disable css_tryget() 6520 * disable css_tryget()
6526 * call_rcu() 6521 * call_rcu()
6527 * offline_css() 6522 * offline_css()
6528 * reparent_charges() 6523 * reparent_charges()
6529 * res_counter_charge() 6524 * res_counter_charge()
6530 * css_put() 6525 * css_put()
6531 * css_free() 6526 * css_free()
6532 * pc->mem_cgroup = dead memcg 6527 * pc->mem_cgroup = dead memcg
6533 * add page to lru 6528 * add page to lru
6534 * 6529 *
6535 * The bulk of the charges are still moved in offline_css() to 6530 * The bulk of the charges are still moved in offline_css() to
6536 * avoid pinning a lot of pages in case a long-term reference 6531 * avoid pinning a lot of pages in case a long-term reference
6537 * like a swapout record is deferring the css_free() to long 6532 * like a swapout record is deferring the css_free() to long
6538 * after offlining. But this makes sure we catch any charges 6533 * after offlining. But this makes sure we catch any charges
6539 * made after offlining: 6534 * made after offlining:
6540 */ 6535 */
6541 mem_cgroup_reparent_charges(memcg); 6536 mem_cgroup_reparent_charges(memcg);
6542 6537
6543 memcg_destroy_kmem(memcg); 6538 memcg_destroy_kmem(memcg);
6544 __mem_cgroup_free(memcg); 6539 __mem_cgroup_free(memcg);
6545 } 6540 }
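
The race diagram in the comment above hinges on the id-to-memcg lookup and css_tryget() happening inside one RCU section while the res_counter charge lands later, outside it. A hedged sketch of that lookup side (#0 in the diagram); the wrapper name is illustrative, but lookup_swap_cgroup_id(), mem_cgroup_lookup() and css_tryget() are the helpers the comment refers to:

	static struct mem_cgroup *lookup_memcg_from_swap(swp_entry_t ent)
	{
		unsigned short id = lookup_swap_cgroup_id(ent);
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);		/* swap record id -> memcg */
		if (memcg && !css_tryget(&memcg->css))	/* fails once tryget is disabled */
			memcg = NULL;
		rcu_read_unlock();

		/*
		 * The charge against memcg->res happens later, outside this
		 * RCU section -- the window that css_free() closes above by
		 * calling mem_cgroup_reparent_charges() one last time.
		 */
		return memcg;
	}
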
6546 6541
6547 #ifdef CONFIG_MMU 6542 #ifdef CONFIG_MMU
6548 /* Handlers for move charge at task migration. */ 6543 /* Handlers for move charge at task migration. */
6549 #define PRECHARGE_COUNT_AT_ONCE 256 6544 #define PRECHARGE_COUNT_AT_ONCE 256
6550 static int mem_cgroup_do_precharge(unsigned long count) 6545 static int mem_cgroup_do_precharge(unsigned long count)
6551 { 6546 {
6552 int ret = 0; 6547 int ret = 0;
6553 int batch_count = PRECHARGE_COUNT_AT_ONCE; 6548 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6554 struct mem_cgroup *memcg = mc.to; 6549 struct mem_cgroup *memcg = mc.to;
6555 6550
6556 if (mem_cgroup_is_root(memcg)) { 6551 if (mem_cgroup_is_root(memcg)) {
6557 mc.precharge += count; 6552 mc.precharge += count;
6558 /* we don't need css_get for root */ 6553 /* we don't need css_get for root */
6559 return ret; 6554 return ret;
6560 } 6555 }
6561 /* try to charge at once */ 6556 /* try to charge at once */
6562 if (count > 1) { 6557 if (count > 1) {
6563 struct res_counter *dummy; 6558 struct res_counter *dummy;
6564 /* 6559 /*
6565 * "memcg" cannot be under rmdir() because we've already checked 6560 * "memcg" cannot be under rmdir() because we've already checked
6566 * by cgroup_lock_live_cgroup() that it is not removed and we 6561 * by cgroup_lock_live_cgroup() that it is not removed and we
6567 * are still under the same cgroup_mutex. So we can postpone 6562 * are still under the same cgroup_mutex. So we can postpone
6568 * css_get(). 6563 * css_get().
6569 */ 6564 */
6570 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 6565 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6571 goto one_by_one; 6566 goto one_by_one;
6572 if (do_swap_account && res_counter_charge(&memcg->memsw, 6567 if (do_swap_account && res_counter_charge(&memcg->memsw,
6573 PAGE_SIZE * count, &dummy)) { 6568 PAGE_SIZE * count, &dummy)) {
6574 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 6569 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6575 goto one_by_one; 6570 goto one_by_one;
6576 } 6571 }
6577 mc.precharge += count; 6572 mc.precharge += count;
6578 return ret; 6573 return ret;
6579 } 6574 }
6580 one_by_one: 6575 one_by_one:
6581 /* fall back to one by one charge */ 6576 /* fall back to one by one charge */
6582 while (count--) { 6577 while (count--) {
6583 if (signal_pending(current)) { 6578 if (signal_pending(current)) {
6584 ret = -EINTR; 6579 ret = -EINTR;
6585 break; 6580 break;
6586 } 6581 }
6587 if (!batch_count--) { 6582 if (!batch_count--) {
6588 batch_count = PRECHARGE_COUNT_AT_ONCE; 6583 batch_count = PRECHARGE_COUNT_AT_ONCE;
6589 cond_resched(); 6584 cond_resched();
6590 } 6585 }
6591 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); 6586 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
6592 if (ret) 6587 if (ret)
6593 /* mem_cgroup_clear_mc() will do uncharge later */ 6588 /* mem_cgroup_clear_mc() will do uncharge later */
6594 return ret; 6589 return ret;
6595 mc.precharge++; 6590 mc.precharge++;
6596 } 6591 }
6597 return ret; 6592 return ret;
6598 } 6593 }
6599 6594
6600 /** 6595 /**
6601 * get_mctgt_type - get target type of moving charge 6596 * get_mctgt_type - get target type of moving charge
6602 * @vma: the vma the pte to be checked belongs 6597 * @vma: the vma the pte to be checked belongs
6603 * @addr: the address corresponding to the pte to be checked 6598 * @addr: the address corresponding to the pte to be checked
6604 * @ptent: the pte to be checked 6599 * @ptent: the pte to be checked
6605 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6600 * @target: the pointer the target page or swap ent will be stored(can be NULL)
6606 * 6601 *
6607 * Returns 6602 * Returns
6608 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 6603 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
6609 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 6604 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
6610 * move charge. if @target is not NULL, the page is stored in target->page 6605 * move charge. if @target is not NULL, the page is stored in target->page
6611 * with extra refcnt got(Callers should handle it). 6606 * with extra refcnt got(Callers should handle it).
6612 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 6607 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
6613 * target for charge migration. if @target is not NULL, the entry is stored 6608 * target for charge migration. if @target is not NULL, the entry is stored
6614 * in target->ent. 6609 * in target->ent.
6615 * 6610 *
6616 * Called with pte lock held. 6611 * Called with pte lock held.
6617 */ 6612 */
6618 union mc_target { 6613 union mc_target {
6619 struct page *page; 6614 struct page *page;
6620 swp_entry_t ent; 6615 swp_entry_t ent;
6621 }; 6616 };
6622 6617
6623 enum mc_target_type { 6618 enum mc_target_type {
6624 MC_TARGET_NONE = 0, 6619 MC_TARGET_NONE = 0,
6625 MC_TARGET_PAGE, 6620 MC_TARGET_PAGE,
6626 MC_TARGET_SWAP, 6621 MC_TARGET_SWAP,
6627 }; 6622 };
6628 6623
6629 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 6624 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6630 unsigned long addr, pte_t ptent) 6625 unsigned long addr, pte_t ptent)
6631 { 6626 {
6632 struct page *page = vm_normal_page(vma, addr, ptent); 6627 struct page *page = vm_normal_page(vma, addr, ptent);
6633 6628
6634 if (!page || !page_mapped(page)) 6629 if (!page || !page_mapped(page))
6635 return NULL; 6630 return NULL;
6636 if (PageAnon(page)) { 6631 if (PageAnon(page)) {
6637 /* we don't move shared anon */ 6632 /* we don't move shared anon */
6638 if (!move_anon()) 6633 if (!move_anon())
6639 return NULL; 6634 return NULL;
6640 } else if (!move_file()) 6635 } else if (!move_file())
6641 /* we ignore mapcount for file pages */ 6636 /* we ignore mapcount for file pages */
6642 return NULL; 6637 return NULL;
6643 if (!get_page_unless_zero(page)) 6638 if (!get_page_unless_zero(page))
6644 return NULL; 6639 return NULL;
6645 6640
6646 return page; 6641 return page;
6647 } 6642 }
6648 6643
6649 #ifdef CONFIG_SWAP 6644 #ifdef CONFIG_SWAP
6650 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6645 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6651 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6646 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6652 { 6647 {
6653 struct page *page = NULL; 6648 struct page *page = NULL;
6654 swp_entry_t ent = pte_to_swp_entry(ptent); 6649 swp_entry_t ent = pte_to_swp_entry(ptent);
6655 6650
6656 if (!move_anon() || non_swap_entry(ent)) 6651 if (!move_anon() || non_swap_entry(ent))
6657 return NULL; 6652 return NULL;
6658 /* 6653 /*
6659 * Because lookup_swap_cache() updates some statistics counter, 6654 * Because lookup_swap_cache() updates some statistics counter,
6660 * we call find_get_page() with swapper_space directly. 6655 * we call find_get_page() with swapper_space directly.
6661 */ 6656 */
6662 page = find_get_page(swap_address_space(ent), ent.val); 6657 page = find_get_page(swap_address_space(ent), ent.val);
6663 if (do_swap_account) 6658 if (do_swap_account)
6664 entry->val = ent.val; 6659 entry->val = ent.val;
6665 6660
6666 return page; 6661 return page;
6667 } 6662 }
6668 #else 6663 #else
6669 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6664 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6670 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6665 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6671 { 6666 {
6672 return NULL; 6667 return NULL;
6673 } 6668 }
6674 #endif 6669 #endif
6675 6670
6676 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 6671 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6677 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6672 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6678 { 6673 {
6679 struct page *page = NULL; 6674 struct page *page = NULL;
6680 struct address_space *mapping; 6675 struct address_space *mapping;
6681 pgoff_t pgoff; 6676 pgoff_t pgoff;
6682 6677
6683 if (!vma->vm_file) /* anonymous vma */ 6678 if (!vma->vm_file) /* anonymous vma */
6684 return NULL; 6679 return NULL;
6685 if (!move_file()) 6680 if (!move_file())
6686 return NULL; 6681 return NULL;
6687 6682
6688 mapping = vma->vm_file->f_mapping; 6683 mapping = vma->vm_file->f_mapping;
6689 if (pte_none(ptent)) 6684 if (pte_none(ptent))
6690 pgoff = linear_page_index(vma, addr); 6685 pgoff = linear_page_index(vma, addr);
6691 else /* pte_file(ptent) is true */ 6686 else /* pte_file(ptent) is true */
6692 pgoff = pte_to_pgoff(ptent); 6687 pgoff = pte_to_pgoff(ptent);
6693 6688
6694 /* page is moved even if it's not RSS of this task(page-faulted). */ 6689 /* page is moved even if it's not RSS of this task(page-faulted). */
6695 page = find_get_page(mapping, pgoff); 6690 page = find_get_page(mapping, pgoff);
6696 6691
6697 #ifdef CONFIG_SWAP 6692 #ifdef CONFIG_SWAP
6698 /* shmem/tmpfs may report page out on swap: account for that too. */ 6693 /* shmem/tmpfs may report page out on swap: account for that too. */
6699 if (radix_tree_exceptional_entry(page)) { 6694 if (radix_tree_exceptional_entry(page)) {
6700 swp_entry_t swap = radix_to_swp_entry(page); 6695 swp_entry_t swap = radix_to_swp_entry(page);
6701 if (do_swap_account) 6696 if (do_swap_account)
6702 *entry = swap; 6697 *entry = swap;
6703 page = find_get_page(swap_address_space(swap), swap.val); 6698 page = find_get_page(swap_address_space(swap), swap.val);
6704 } 6699 }
6705 #endif 6700 #endif
6706 return page; 6701 return page;
6707 } 6702 }
6708 6703
6709 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6704 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6710 unsigned long addr, pte_t ptent, union mc_target *target) 6705 unsigned long addr, pte_t ptent, union mc_target *target)
6711 { 6706 {
6712 struct page *page = NULL; 6707 struct page *page = NULL;
6713 struct page_cgroup *pc; 6708 struct page_cgroup *pc;
6714 enum mc_target_type ret = MC_TARGET_NONE; 6709 enum mc_target_type ret = MC_TARGET_NONE;
6715 swp_entry_t ent = { .val = 0 }; 6710 swp_entry_t ent = { .val = 0 };
6716 6711
6717 if (pte_present(ptent)) 6712 if (pte_present(ptent))
6718 page = mc_handle_present_pte(vma, addr, ptent); 6713 page = mc_handle_present_pte(vma, addr, ptent);
6719 else if (is_swap_pte(ptent)) 6714 else if (is_swap_pte(ptent))
6720 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 6715 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6721 else if (pte_none(ptent) || pte_file(ptent)) 6716 else if (pte_none(ptent) || pte_file(ptent))
6722 page = mc_handle_file_pte(vma, addr, ptent, &ent); 6717 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6723 6718
6724 if (!page && !ent.val) 6719 if (!page && !ent.val)
6725 return ret; 6720 return ret;
6726 if (page) { 6721 if (page) {
6727 pc = lookup_page_cgroup(page); 6722 pc = lookup_page_cgroup(page);
6728 /* 6723 /*
6729 * Do only loose check w/o page_cgroup lock. 6724 * Do only loose check w/o page_cgroup lock.
6730 * mem_cgroup_move_account() checks the pc is valid or not under 6725 * mem_cgroup_move_account() checks the pc is valid or not under
6731 * the lock. 6726 * the lock.
6732 */ 6727 */
6733 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6728 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6734 ret = MC_TARGET_PAGE; 6729 ret = MC_TARGET_PAGE;
6735 if (target) 6730 if (target)
6736 target->page = page; 6731 target->page = page;
6737 } 6732 }
6738 if (!ret || !target) 6733 if (!ret || !target)
6739 put_page(page); 6734 put_page(page);
6740 } 6735 }
6741 /* There is a swap entry and a page doesn't exist or isn't charged */ 6736 /* There is a swap entry and a page doesn't exist or isn't charged */
6742 if (ent.val && !ret && 6737 if (ent.val && !ret &&
6743 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 6738 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6744 ret = MC_TARGET_SWAP; 6739 ret = MC_TARGET_SWAP;
6745 if (target) 6740 if (target)
6746 target->ent = ent; 6741 target->ent = ent;
6747 } 6742 }
6748 return ret; 6743 return ret;
6749 } 6744 }
6750 6745
6751 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6746 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6752 /* 6747 /*
6753 * We don't consider swapping or file mapped pages because THP does not 6748 * We don't consider swapping or file mapped pages because THP does not
6754 * support them for now. 6749 * support them for now.
6755 * Caller should make sure that pmd_trans_huge(pmd) is true. 6750 * Caller should make sure that pmd_trans_huge(pmd) is true.
6756 */ 6751 */
6757 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6752 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6758 unsigned long addr, pmd_t pmd, union mc_target *target) 6753 unsigned long addr, pmd_t pmd, union mc_target *target)
6759 { 6754 {
6760 struct page *page = NULL; 6755 struct page *page = NULL;
6761 struct page_cgroup *pc; 6756 struct page_cgroup *pc;
6762 enum mc_target_type ret = MC_TARGET_NONE; 6757 enum mc_target_type ret = MC_TARGET_NONE;
6763 6758
6764 page = pmd_page(pmd); 6759 page = pmd_page(pmd);
6765 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 6760 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6766 if (!move_anon()) 6761 if (!move_anon())
6767 return ret; 6762 return ret;
6768 pc = lookup_page_cgroup(page); 6763 pc = lookup_page_cgroup(page);
6769 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6764 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6770 ret = MC_TARGET_PAGE; 6765 ret = MC_TARGET_PAGE;
6771 if (target) { 6766 if (target) {
6772 get_page(page); 6767 get_page(page);
6773 target->page = page; 6768 target->page = page;
6774 } 6769 }
6775 } 6770 }
6776 return ret; 6771 return ret;
6777 } 6772 }
6778 #else 6773 #else
6779 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6774 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6780 unsigned long addr, pmd_t pmd, union mc_target *target) 6775 unsigned long addr, pmd_t pmd, union mc_target *target)
6781 { 6776 {
6782 return MC_TARGET_NONE; 6777 return MC_TARGET_NONE;
6783 } 6778 }
6784 #endif 6779 #endif
6785 6780
6786 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6781 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6787 unsigned long addr, unsigned long end, 6782 unsigned long addr, unsigned long end,
6788 struct mm_walk *walk) 6783 struct mm_walk *walk)
6789 { 6784 {
6790 struct vm_area_struct *vma = walk->private; 6785 struct vm_area_struct *vma = walk->private;
6791 pte_t *pte; 6786 pte_t *pte;
6792 spinlock_t *ptl; 6787 spinlock_t *ptl;
6793 6788
6794 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6789 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6795 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6790 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6796 mc.precharge += HPAGE_PMD_NR; 6791 mc.precharge += HPAGE_PMD_NR;
6797 spin_unlock(ptl); 6792 spin_unlock(ptl);
6798 return 0; 6793 return 0;
6799 } 6794 }
6800 6795
6801 if (pmd_trans_unstable(pmd)) 6796 if (pmd_trans_unstable(pmd))
6802 return 0; 6797 return 0;
6803 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6798 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6804 for (; addr != end; pte++, addr += PAGE_SIZE) 6799 for (; addr != end; pte++, addr += PAGE_SIZE)
6805 if (get_mctgt_type(vma, addr, *pte, NULL)) 6800 if (get_mctgt_type(vma, addr, *pte, NULL))
6806 mc.precharge++; /* increment precharge temporarily */ 6801 mc.precharge++; /* increment precharge temporarily */
6807 pte_unmap_unlock(pte - 1, ptl); 6802 pte_unmap_unlock(pte - 1, ptl);
6808 cond_resched(); 6803 cond_resched();
6809 6804
6810 return 0; 6805 return 0;
6811 } 6806 }
6812 6807
6813 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6808 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6814 { 6809 {
6815 unsigned long precharge; 6810 unsigned long precharge;
6816 struct vm_area_struct *vma; 6811 struct vm_area_struct *vma;
6817 6812
6818 down_read(&mm->mmap_sem); 6813 down_read(&mm->mmap_sem);
6819 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6814 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6820 struct mm_walk mem_cgroup_count_precharge_walk = { 6815 struct mm_walk mem_cgroup_count_precharge_walk = {
6821 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6816 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6822 .mm = mm, 6817 .mm = mm,
6823 .private = vma, 6818 .private = vma,
6824 }; 6819 };
6825 if (is_vm_hugetlb_page(vma)) 6820 if (is_vm_hugetlb_page(vma))
6826 continue; 6821 continue;
6827 walk_page_range(vma->vm_start, vma->vm_end, 6822 walk_page_range(vma->vm_start, vma->vm_end,
6828 &mem_cgroup_count_precharge_walk); 6823 &mem_cgroup_count_precharge_walk);
6829 } 6824 }
6830 up_read(&mm->mmap_sem); 6825 up_read(&mm->mmap_sem);
6831 6826
6832 precharge = mc.precharge; 6827 precharge = mc.precharge;
6833 mc.precharge = 0; 6828 mc.precharge = 0;
6834 6829
6835 return precharge; 6830 return precharge;
6836 } 6831 }
6837 6832
6838 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6833 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6839 { 6834 {
6840 unsigned long precharge = mem_cgroup_count_precharge(mm); 6835 unsigned long precharge = mem_cgroup_count_precharge(mm);
6841 6836
6842 VM_BUG_ON(mc.moving_task); 6837 VM_BUG_ON(mc.moving_task);
6843 mc.moving_task = current; 6838 mc.moving_task = current;
6844 return mem_cgroup_do_precharge(precharge); 6839 return mem_cgroup_do_precharge(precharge);
6845 } 6840 }
6846 6841
6847 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6842 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6848 static void __mem_cgroup_clear_mc(void) 6843 static void __mem_cgroup_clear_mc(void)
6849 { 6844 {
6850 struct mem_cgroup *from = mc.from; 6845 struct mem_cgroup *from = mc.from;
6851 struct mem_cgroup *to = mc.to; 6846 struct mem_cgroup *to = mc.to;
6852 int i; 6847 int i;
6853 6848
6854 /* we must uncharge all the leftover precharges from mc.to */ 6849 /* we must uncharge all the leftover precharges from mc.to */
6855 if (mc.precharge) { 6850 if (mc.precharge) {
6856 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 6851 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6857 mc.precharge = 0; 6852 mc.precharge = 0;
6858 } 6853 }
6859 /* 6854 /*
6860 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6855 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6861 * we must uncharge here. 6856 * we must uncharge here.
6862 */ 6857 */
6863 if (mc.moved_charge) { 6858 if (mc.moved_charge) {
6864 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 6859 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6865 mc.moved_charge = 0; 6860 mc.moved_charge = 0;
6866 } 6861 }
6867 /* we must fixup refcnts and charges */ 6862 /* we must fixup refcnts and charges */
6868 if (mc.moved_swap) { 6863 if (mc.moved_swap) {
6869 /* uncharge swap account from the old cgroup */ 6864 /* uncharge swap account from the old cgroup */
6870 if (!mem_cgroup_is_root(mc.from)) 6865 if (!mem_cgroup_is_root(mc.from))
6871 res_counter_uncharge(&mc.from->memsw, 6866 res_counter_uncharge(&mc.from->memsw,
6872 PAGE_SIZE * mc.moved_swap); 6867 PAGE_SIZE * mc.moved_swap);
6873 6868
6874 for (i = 0; i < mc.moved_swap; i++) 6869 for (i = 0; i < mc.moved_swap; i++)
6875 css_put(&mc.from->css); 6870 css_put(&mc.from->css);
6876 6871
6877 if (!mem_cgroup_is_root(mc.to)) { 6872 if (!mem_cgroup_is_root(mc.to)) {
6878 /* 6873 /*
6879 * we charged both to->res and to->memsw, so we should 6874 * we charged both to->res and to->memsw, so we should
6880 * uncharge to->res. 6875 * uncharge to->res.
6881 */ 6876 */
6882 res_counter_uncharge(&mc.to->res, 6877 res_counter_uncharge(&mc.to->res,
6883 PAGE_SIZE * mc.moved_swap); 6878 PAGE_SIZE * mc.moved_swap);
6884 } 6879 }
6885 /* we've already done css_get(mc.to) */ 6880 /* we've already done css_get(mc.to) */
6886 mc.moved_swap = 0; 6881 mc.moved_swap = 0;
6887 } 6882 }
6888 memcg_oom_recover(from); 6883 memcg_oom_recover(from);
6889 memcg_oom_recover(to); 6884 memcg_oom_recover(to);
6890 wake_up_all(&mc.waitq); 6885 wake_up_all(&mc.waitq);
6891 } 6886 }
6892 6887
6893 static void mem_cgroup_clear_mc(void) 6888 static void mem_cgroup_clear_mc(void)
6894 { 6889 {
6895 struct mem_cgroup *from = mc.from; 6890 struct mem_cgroup *from = mc.from;
6896 6891
6897 /* 6892 /*
6898 * we must clear moving_task before waking up waiters at the end of 6893 * we must clear moving_task before waking up waiters at the end of
6899 * task migration. 6894 * task migration.
6900 */ 6895 */
6901 mc.moving_task = NULL; 6896 mc.moving_task = NULL;
6902 __mem_cgroup_clear_mc(); 6897 __mem_cgroup_clear_mc();
6903 spin_lock(&mc.lock); 6898 spin_lock(&mc.lock);
6904 mc.from = NULL; 6899 mc.from = NULL;
6905 mc.to = NULL; 6900 mc.to = NULL;
6906 spin_unlock(&mc.lock); 6901 spin_unlock(&mc.lock);
6907 mem_cgroup_end_move(from); 6902 mem_cgroup_end_move(from);
6908 } 6903 }
6909 6904
6910 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6905 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6911 struct cgroup_taskset *tset) 6906 struct cgroup_taskset *tset)
6912 { 6907 {
6913 struct task_struct *p = cgroup_taskset_first(tset); 6908 struct task_struct *p = cgroup_taskset_first(tset);
6914 int ret = 0; 6909 int ret = 0;
6915 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6910 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6916 unsigned long move_charge_at_immigrate; 6911 unsigned long move_charge_at_immigrate;
6917 6912
6918 /* 6913 /*
6919 	 * We are now committed to this value whatever it is. Changes in this 6914 	 * We are now committed to this value whatever it is. Changes in this
6920 * tunable will only affect upcoming migrations, not the current one. 6915 * tunable will only affect upcoming migrations, not the current one.
6921 * So we need to save it, and keep it going. 6916 * So we need to save it, and keep it going.
6922 */ 6917 */
6923 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 6918 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6924 if (move_charge_at_immigrate) { 6919 if (move_charge_at_immigrate) {
6925 struct mm_struct *mm; 6920 struct mm_struct *mm;
6926 struct mem_cgroup *from = mem_cgroup_from_task(p); 6921 struct mem_cgroup *from = mem_cgroup_from_task(p);
6927 6922
6928 VM_BUG_ON(from == memcg); 6923 VM_BUG_ON(from == memcg);
6929 6924
6930 mm = get_task_mm(p); 6925 mm = get_task_mm(p);
6931 if (!mm) 6926 if (!mm)
6932 return 0; 6927 return 0;
6933 		/* We move charges only when we move an owner of the mm */ 6928 		/* We move charges only when we move an owner of the mm */
6934 if (mm->owner == p) { 6929 if (mm->owner == p) {
6935 VM_BUG_ON(mc.from); 6930 VM_BUG_ON(mc.from);
6936 VM_BUG_ON(mc.to); 6931 VM_BUG_ON(mc.to);
6937 VM_BUG_ON(mc.precharge); 6932 VM_BUG_ON(mc.precharge);
6938 VM_BUG_ON(mc.moved_charge); 6933 VM_BUG_ON(mc.moved_charge);
6939 VM_BUG_ON(mc.moved_swap); 6934 VM_BUG_ON(mc.moved_swap);
6940 mem_cgroup_start_move(from); 6935 mem_cgroup_start_move(from);
6941 spin_lock(&mc.lock); 6936 spin_lock(&mc.lock);
6942 mc.from = from; 6937 mc.from = from;
6943 mc.to = memcg; 6938 mc.to = memcg;
6944 mc.immigrate_flags = move_charge_at_immigrate; 6939 mc.immigrate_flags = move_charge_at_immigrate;
6945 spin_unlock(&mc.lock); 6940 spin_unlock(&mc.lock);
6946 /* We set mc.moving_task later */ 6941 /* We set mc.moving_task later */
6947 6942
6948 ret = mem_cgroup_precharge_mc(mm); 6943 ret = mem_cgroup_precharge_mc(mm);
6949 if (ret) 6944 if (ret)
6950 mem_cgroup_clear_mc(); 6945 mem_cgroup_clear_mc();
6951 } 6946 }
6952 mmput(mm); 6947 mmput(mm);
6953 } 6948 }
6954 return ret; 6949 return ret;
6955 } 6950 }
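
can_attach() only records intent: it saves move_charge_at_immigrate, points the global move-charge state at the source and destination groups, and precharges the destination for every movable pte found by the walk above; the actual transfer happens in the move path that follows, and mem_cgroup_clear_mc() above is the teardown, invoked on precharge failure or from cancel_attach() just below. A hedged sketch of that shared state, assembled from the fields this hunk touches (the real definition and its initializer sit earlier in memcontrol.c):

	static struct move_charge_struct {
		spinlock_t	   lock;		/* guards from/to */
		struct mem_cgroup  *from, *to;		/* source and target memcg */
		unsigned long	   immigrate_flags;	/* saved move_charge_at_immigrate */
		unsigned long	   precharge;		/* charges taken up front */
		unsigned long	   moved_charge;	/* pages actually moved */
		unsigned long	   moved_swap;		/* swap entries moved */
		struct task_struct *moving_task;	/* the task performing the move */
		wait_queue_head_t  waitq;		/* concurrent chargers wait here */
	} mc;

In practice (cgroup v1) this path is armed by writing a non-zero mask to memory.move_charge_at_immigrate and then migrating a task that owns its mm into the destination group, which is what triggers the can_attach()/attach callbacks.
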
6956 6951
6957 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6952 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6958 struct cgroup_taskset *tset) 6953 struct cgroup_taskset *tset)
6959 { 6954 {
6960 mem_cgroup_clear_mc(); 6955 mem_cgroup_clear_mc();
6961 } 6956 }
6962 6957
6963 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6958 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6964 unsigned long addr, unsigned long end, 6959 unsigned long addr, unsigned long end,
6965 struct mm_walk *walk) 6960 struct mm_walk *walk)
6966 { 6961 {
6967 int ret = 0; 6962 int ret = 0;
6968 struct vm_area_struct *vma = walk->private; 6963 struct vm_area_struct *vma = walk->private;
6969 pte_t *pte; 6964 pte_t *pte;
6970 spinlock_t *ptl; 6965 spinlock_t *ptl;
6971 enum mc_target_type target_type; 6966 enum mc_target_type target_type;
6972 union mc_target target; 6967 union mc_target target;
6973 struct page *page; 6968 struct page *page;
6974 struct page_cgroup *pc; 6969 struct page_cgroup *pc;
6975 6970
6976 /* 6971 /*
6977 * We don't take compound_lock() here but no race with splitting thp 6972 * We don't take compound_lock() here but no race with splitting thp
6978 * happens because: 6973 * happens because:
6979 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 6974 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
6980 * under splitting, which means there's no concurrent thp split, 6975 * under splitting, which means there's no concurrent thp split,
6981 * - if another thread runs into split_huge_page() just after we 6976 * - if another thread runs into split_huge_page() just after we
6982 * entered this if-block, the thread must wait for page table lock 6977 * entered this if-block, the thread must wait for page table lock
6983 * to be unlocked in __split_huge_page_splitting(), where the main 6978 * to be unlocked in __split_huge_page_splitting(), where the main
6984 * part of thp split is not executed yet. 6979 * part of thp split is not executed yet.
6985 */ 6980 */
6986 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6981 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6987 if (mc.precharge < HPAGE_PMD_NR) { 6982 if (mc.precharge < HPAGE_PMD_NR) {
6988 spin_unlock(ptl); 6983 spin_unlock(ptl);
6989 return 0; 6984 return 0;
6990 } 6985 }
6991 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6986 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6992 if (target_type == MC_TARGET_PAGE) { 6987 if (target_type == MC_TARGET_PAGE) {
6993 page = target.page; 6988 page = target.page;
6994 if (!isolate_lru_page(page)) { 6989 if (!isolate_lru_page(page)) {
6995 pc = lookup_page_cgroup(page); 6990 pc = lookup_page_cgroup(page);
6996 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 6991 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6997 pc, mc.from, mc.to)) { 6992 pc, mc.from, mc.to)) {
6998 mc.precharge -= HPAGE_PMD_NR; 6993 mc.precharge -= HPAGE_PMD_NR;
6999 mc.moved_charge += HPAGE_PMD_NR; 6994 mc.moved_charge += HPAGE_PMD_NR;
7000 } 6995 }
7001 putback_lru_page(page); 6996 putback_lru_page(page);
7002 } 6997 }
7003 put_page(page); 6998 put_page(page);
7004 } 6999 }
7005 spin_unlock(ptl); 7000 spin_unlock(ptl);
7006 return 0; 7001 return 0;
7007 } 7002 }
7008 7003
7009 if (pmd_trans_unstable(pmd)) 7004 if (pmd_trans_unstable(pmd))
7010 return 0; 7005 return 0;
7011 retry: 7006 retry:
7012 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 7007 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7013 for (; addr != end; addr += PAGE_SIZE) { 7008 for (; addr != end; addr += PAGE_SIZE) {
7014 pte_t ptent = *(pte++); 7009 pte_t ptent = *(pte++);
7015 swp_entry_t ent; 7010 swp_entry_t ent;
7016 7011
7017 if (!mc.precharge) 7012 if (!mc.precharge)
7018 break; 7013 break;
7019 7014
7020 switch (get_mctgt_type(vma, addr, ptent, &target)) { 7015 switch (get_mctgt_type(vma, addr, ptent, &target)) {
7021 case MC_TARGET_PAGE: 7016 case MC_TARGET_PAGE:
7022 page = target.page; 7017 page = target.page;
7023 if (isolate_lru_page(page)) 7018 if (isolate_lru_page(page))
7024 goto put; 7019 goto put;
7025 pc = lookup_page_cgroup(page); 7020 pc = lookup_page_cgroup(page);
7026 if (!mem_cgroup_move_account(page, 1, pc, 7021 if (!mem_cgroup_move_account(page, 1, pc,
7027 mc.from, mc.to)) { 7022 mc.from, mc.to)) {
7028 mc.precharge--; 7023 mc.precharge--;
7029 /* we uncharge from mc.from later. */ 7024 /* we uncharge from mc.from later. */
7030 mc.moved_charge++; 7025 mc.moved_charge++;
7031 } 7026 }
7032 putback_lru_page(page); 7027 putback_lru_page(page);
7033 put: /* get_mctgt_type() gets the page */ 7028 put: /* get_mctgt_type() gets the page */
7034 put_page(page); 7029 put_page(page);
7035 break; 7030 break;
7036 case MC_TARGET_SWAP: 7031 case MC_TARGET_SWAP:
7037 ent = target.ent; 7032 ent = target.ent;
7038 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 7033 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
7039 mc.precharge--; 7034 mc.precharge--;
7040 /* we fixup refcnts and charges later. */ 7035 /* we fixup refcnts and charges later. */
7041 mc.moved_swap++; 7036 mc.moved_swap++;
7042 } 7037 }
7043 break; 7038 break;
7044 default: 7039 default:
7045 break; 7040 break;
7046 } 7041 }
7047 } 7042 }
7048 pte_unmap_unlock(pte - 1, ptl); 7043 pte_unmap_unlock(pte - 1, ptl);
7049 cond_resched(); 7044 cond_resched();
7050 7045
7051 if (addr != end) { 7046 if (addr != end) {
7052 /* 7047 /*
7053 * We have consumed all precharges we got in can_attach(). 7048 * We have consumed all precharges we got in can_attach().
7054 * We try to charge one by one, but don't do any additional 7049 * We try to charge one by one, but don't do any additional
7055 * charges to mc.to if we have failed to charge once in the 7050 * charges to mc.to if we have failed to charge once in the
7056 * attach() phase. 7051 * attach() phase.
7057 */ 7052 */
7058 ret = mem_cgroup_do_precharge(1); 7053 ret = mem_cgroup_do_precharge(1);
7059 if (!ret) 7054 if (!ret)
7060 goto retry; 7055 goto retry;
7061 } 7056 }
7062 7057
7063 return ret; 7058 return ret;
7064 } 7059 }
7065 7060
7066 static void mem_cgroup_move_charge(struct mm_struct *mm) 7061 static void mem_cgroup_move_charge(struct mm_struct *mm)
7067 { 7062 {
7068 struct vm_area_struct *vma; 7063 struct vm_area_struct *vma;
7069 7064
7070 lru_add_drain_all(); 7065 lru_add_drain_all();
7071 retry: 7066 retry:
7072 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 7067 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
7073 /* 7068 /*
7074 * Someone who is holding the mmap_sem might be waiting in 7069 * Someone who is holding the mmap_sem might be waiting in
7075 * waitq. So we cancel all extra charges, wake up all waiters, 7070 * waitq. So we cancel all extra charges, wake up all waiters,
7076 * and retry. Because we cancel precharges, we might not be able 7071 * and retry. Because we cancel precharges, we might not be able
7077 * to move enough charges, but moving charge is a best-effort 7072 * to move enough charges, but moving charge is a best-effort
7078 * feature anyway, so it wouldn't be a big problem. 7073 * feature anyway, so it wouldn't be a big problem.
7079 */ 7074 */
7080 __mem_cgroup_clear_mc(); 7075 __mem_cgroup_clear_mc();
7081 cond_resched(); 7076 cond_resched();
7082 goto retry; 7077 goto retry;
7083 } 7078 }
7084 for (vma = mm->mmap; vma; vma = vma->vm_next) { 7079 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7085 int ret; 7080 int ret;
7086 struct mm_walk mem_cgroup_move_charge_walk = { 7081 struct mm_walk mem_cgroup_move_charge_walk = {
7087 .pmd_entry = mem_cgroup_move_charge_pte_range, 7082 .pmd_entry = mem_cgroup_move_charge_pte_range,
7088 .mm = mm, 7083 .mm = mm,
7089 .private = vma, 7084 .private = vma,
7090 }; 7085 };
7091 if (is_vm_hugetlb_page(vma)) 7086 if (is_vm_hugetlb_page(vma))
7092 continue; 7087 continue;
7093 ret = walk_page_range(vma->vm_start, vma->vm_end, 7088 ret = walk_page_range(vma->vm_start, vma->vm_end,
7094 &mem_cgroup_move_charge_walk); 7089 &mem_cgroup_move_charge_walk);
7095 if (ret) 7090 if (ret)
7096 /* 7091 /*
7097 * means we have consumed all precharges and failed in 7092 * means we have consumed all precharges and failed in
7098 * doing additional charge. Just abandon here. 7093 * doing additional charge. Just abandon here.
7099 */ 7094 */
7100 break; 7095 break;
7101 } 7096 }
7102 up_read(&mm->mmap_sem); 7097 up_read(&mm->mmap_sem);
7103 } 7098 }
7104 7099
7105 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7100 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7106 struct cgroup_taskset *tset) 7101 struct cgroup_taskset *tset)
7107 { 7102 {
7108 struct task_struct *p = cgroup_taskset_first(tset); 7103 struct task_struct *p = cgroup_taskset_first(tset);
7109 struct mm_struct *mm = get_task_mm(p); 7104 struct mm_struct *mm = get_task_mm(p);
7110 7105
7111 if (mm) { 7106 if (mm) {
7112 if (mc.to) 7107 if (mc.to)
7113 mem_cgroup_move_charge(mm); 7108 mem_cgroup_move_charge(mm);
7114 mmput(mm); 7109 mmput(mm);
7115 } 7110 }
7116 if (mc.to) 7111 if (mc.to)
7117 mem_cgroup_clear_mc(); 7112 mem_cgroup_clear_mc();
7118 } 7113 }
7119 #else /* !CONFIG_MMU */ 7114 #else /* !CONFIG_MMU */
7120 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 7115 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
7121 struct cgroup_taskset *tset) 7116 struct cgroup_taskset *tset)
7122 { 7117 {
7123 return 0; 7118 return 0;
7124 } 7119 }
7125 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 7120 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
7126 struct cgroup_taskset *tset) 7121 struct cgroup_taskset *tset)
7127 { 7122 {
7128 } 7123 }
7129 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7124 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7130 struct cgroup_taskset *tset) 7125 struct cgroup_taskset *tset)
7131 { 7126 {
7132 } 7127 }
7133 #endif 7128 #endif
7134 7129
7135 /* 7130 /*
7136 * Cgroup retains root cgroups across [un]mount cycles, making it necessary 7131 * Cgroup retains root cgroups across [un]mount cycles, making it necessary
7137 * to verify the sane_behavior flag on each mount attempt. 7132 * to verify the sane_behavior flag on each mount attempt.
7138 */ 7133 */
7139 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 7134 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7140 { 7135 {
7141 /* 7136 /*
7142 * use_hierarchy is forced with sane_behavior. cgroup core 7137 * use_hierarchy is forced with sane_behavior. cgroup core
7143 * guarantees that @root doesn't have any children, so turning it 7138 * guarantees that @root doesn't have any children, so turning it
7144 * on for the root memcg is enough. 7139 * on for the root memcg is enough.
7145 */ 7140 */
7146 if (cgroup_sane_behavior(root_css->cgroup)) 7141 if (cgroup_sane_behavior(root_css->cgroup))
7147 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7142 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7148 } 7143 }
7149 7144
7150 struct cgroup_subsys memory_cgrp_subsys = { 7145 struct cgroup_subsys memory_cgrp_subsys = {
7151 .css_alloc = mem_cgroup_css_alloc, 7146 .css_alloc = mem_cgroup_css_alloc,
7152 .css_online = mem_cgroup_css_online, 7147 .css_online = mem_cgroup_css_online,
7153 .css_offline = mem_cgroup_css_offline, 7148 .css_offline = mem_cgroup_css_offline,
7154 .css_free = mem_cgroup_css_free, 7149 .css_free = mem_cgroup_css_free,
7155 .can_attach = mem_cgroup_can_attach, 7150 .can_attach = mem_cgroup_can_attach,
7156 .cancel_attach = mem_cgroup_cancel_attach, 7151 .cancel_attach = mem_cgroup_cancel_attach,
7157 .attach = mem_cgroup_move_task, 7152 .attach = mem_cgroup_move_task,
7158 .bind = mem_cgroup_bind, 7153 .bind = mem_cgroup_bind,
7159 .base_cftypes = mem_cgroup_files, 7154 .base_cftypes = mem_cgroup_files,
7160 .early_init = 0, 7155 .early_init = 0,
7161 }; 7156 };
7162 7157
7163 #ifdef CONFIG_MEMCG_SWAP 7158 #ifdef CONFIG_MEMCG_SWAP
7164 static int __init enable_swap_account(char *s) 7159 static int __init enable_swap_account(char *s)
7165 { 7160 {
7166 if (!strcmp(s, "1")) 7161 if (!strcmp(s, "1"))
7167 really_do_swap_account = 1; 7162 really_do_swap_account = 1;
7168 else if (!strcmp(s, "0")) 7163 else if (!strcmp(s, "0"))
7169 really_do_swap_account = 0; 7164 really_do_swap_account = 0;
7170 return 1; 7165 return 1;
7171 } 7166 }
7172 __setup("swapaccount=", enable_swap_account); 7167 __setup("swapaccount=", enable_swap_account);
7173 7168
7174 static void __init memsw_file_init(void) 7169 static void __init memsw_file_init(void)
7175 { 7170 {
7176 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); 7171 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
7177 } 7172 }
7178 7173
7179 static void __init enable_swap_cgroup(void) 7174 static void __init enable_swap_cgroup(void)
7180 { 7175 {
7181 if (!mem_cgroup_disabled() && really_do_swap_account) { 7176 if (!mem_cgroup_disabled() && really_do_swap_account) {
7182 do_swap_account = 1; 7177 do_swap_account = 1;
7183 memsw_file_init(); 7178 memsw_file_init();
7184 } 7179 }
7185 } 7180 }
7186 7181
7187 #else 7182 #else
7188 static void __init enable_swap_cgroup(void) 7183 static void __init enable_swap_cgroup(void)
7189 { 7184 {
7190 } 7185 }
7191 #endif 7186 #endif
7192 7187
7193 /* 7188 /*
7194 * subsys_initcall() for memory controller. 7189 * subsys_initcall() for memory controller.
7195 * 7190 *
7196 * Some parts like hotcpu_notifier() have to be initialized from this context 7191 * Some parts like hotcpu_notifier() have to be initialized from this context
7197 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 7192 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
7198 * everything that doesn't depend on a specific mem_cgroup structure should 7193 * everything that doesn't depend on a specific mem_cgroup structure should
7199 * be initialized from here. 7194 * be initialized from here.
7200 */ 7195 */
7201 static int __init mem_cgroup_init(void) 7196 static int __init mem_cgroup_init(void)
7202 { 7197 {
7203 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7198 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7204 enable_swap_cgroup(); 7199 enable_swap_cgroup();
7205 mem_cgroup_soft_limit_tree_init(); 7200 mem_cgroup_soft_limit_tree_init();
7206 memcg_stock_init(); 7201 memcg_stock_init();
7207 return 0; 7202 return 0;
7208 } 7203 }
7209 subsys_initcall(mem_cgroup_init); 7204 subsys_initcall(mem_cgroup_init);
7210 7205
mm/slab_common.c
1 /* 1 /*
2 * Slab allocator functions that are independent of the allocator strategy 2 * Slab allocator functions that are independent of the allocator strategy
3 * 3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com> 4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */ 5 */
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/poison.h> 9 #include <linux/poison.h>
10 #include <linux/interrupt.h> 10 #include <linux/interrupt.h>
11 #include <linux/memory.h> 11 #include <linux/memory.h>
12 #include <linux/compiler.h> 12 #include <linux/compiler.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <linux/uaccess.h> 15 #include <linux/uaccess.h>
16 #include <linux/seq_file.h> 16 #include <linux/seq_file.h>
17 #include <linux/proc_fs.h> 17 #include <linux/proc_fs.h>
18 #include <asm/cacheflush.h> 18 #include <asm/cacheflush.h>
19 #include <asm/tlbflush.h> 19 #include <asm/tlbflush.h>
20 #include <asm/page.h> 20 #include <asm/page.h>
21 #include <linux/memcontrol.h> 21 #include <linux/memcontrol.h>
22 #include <trace/events/kmem.h> 22 #include <trace/events/kmem.h>
23 23
24 #include "slab.h" 24 #include "slab.h"
25 25
26 enum slab_state slab_state; 26 enum slab_state slab_state;
27 LIST_HEAD(slab_caches); 27 LIST_HEAD(slab_caches);
28 DEFINE_MUTEX(slab_mutex); 28 DEFINE_MUTEX(slab_mutex);
29 struct kmem_cache *kmem_cache; 29 struct kmem_cache *kmem_cache;
30 30
31 #ifdef CONFIG_DEBUG_VM 31 #ifdef CONFIG_DEBUG_VM
32 static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, 32 static int kmem_cache_sanity_check(const char *name, size_t size)
33 size_t size)
34 { 33 {
35 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
36 35
37 if (!name || in_interrupt() || size < sizeof(void *) || 36 if (!name || in_interrupt() || size < sizeof(void *) ||
38 size > KMALLOC_MAX_SIZE) { 37 size > KMALLOC_MAX_SIZE) {
39 pr_err("kmem_cache_create(%s) integrity check failed\n", name); 38 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
40 return -EINVAL; 39 return -EINVAL;
41 } 40 }
42 41
43 list_for_each_entry(s, &slab_caches, list) { 42 list_for_each_entry(s, &slab_caches, list) {
44 char tmp; 43 char tmp;
45 int res; 44 int res;
46 45
47 /* 46 /*
48 * This happens when the module gets unloaded and doesn't 47 * This happens when the module gets unloaded and doesn't
49 * destroy its slab cache and no-one else reuses the vmalloc 48 * destroy its slab cache and no-one else reuses the vmalloc
50 * area of the module. Print a warning. 49 * area of the module. Print a warning.
51 */ 50 */
52 res = probe_kernel_address(s->name, tmp); 51 res = probe_kernel_address(s->name, tmp);
53 if (res) { 52 if (res) {
54 pr_err("Slab cache with size %d has lost its name\n", 53 pr_err("Slab cache with size %d has lost its name\n",
55 s->object_size); 54 s->object_size);
56 continue; 55 continue;
57 } 56 }
58 57
59 #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) 58 #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
60 /* 59 if (!strcmp(s->name, name)) {
61 * For simplicity, we won't check this in the list of memcg
62 * caches. We have control over memcg naming, and if there
63 * aren't duplicates in the global list, there won't be any
64 * duplicates in the memcg lists as well.
65 */
66 if (!memcg && !strcmp(s->name, name)) {
67 pr_err("%s (%s): Cache name already exists.\n", 60 pr_err("%s (%s): Cache name already exists.\n",
68 __func__, name); 61 __func__, name);
69 dump_stack(); 62 dump_stack();
70 s = NULL; 63 s = NULL;
71 return -EINVAL; 64 return -EINVAL;
72 } 65 }
73 #endif 66 #endif
74 } 67 }
75 68
76 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 69 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
77 return 0; 70 return 0;
78 } 71 }
79 #else 72 #else
80 static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, 73 static inline int kmem_cache_sanity_check(const char *name, size_t size)
81 const char *name, size_t size)
82 { 74 {
83 return 0; 75 return 0;
84 } 76 }
85 #endif 77 #endif
86 78
87 #ifdef CONFIG_MEMCG_KMEM 79 #ifdef CONFIG_MEMCG_KMEM
88 int memcg_update_all_caches(int num_memcgs) 80 int memcg_update_all_caches(int num_memcgs)
89 { 81 {
90 struct kmem_cache *s; 82 struct kmem_cache *s;
91 int ret = 0; 83 int ret = 0;
92 mutex_lock(&slab_mutex); 84 mutex_lock(&slab_mutex);
93 85
94 list_for_each_entry(s, &slab_caches, list) { 86 list_for_each_entry(s, &slab_caches, list) {
95 if (!is_root_cache(s)) 87 if (!is_root_cache(s))
96 continue; 88 continue;
97 89
98 ret = memcg_update_cache_size(s, num_memcgs); 90 ret = memcg_update_cache_size(s, num_memcgs);
99 /* 91 /*
100 * See comment in memcontrol.c, memcg_update_cache_size: 92 * See comment in memcontrol.c, memcg_update_cache_size:
101 * Instead of freeing the memory, we'll just leave the caches 93 * Instead of freeing the memory, we'll just leave the caches
102 * up to this point in an updated state. 94 * up to this point in an updated state.
103 */ 95 */
104 if (ret) 96 if (ret)
105 goto out; 97 goto out;
106 } 98 }
107 99
108 memcg_update_array_size(num_memcgs); 100 memcg_update_array_size(num_memcgs);
109 out: 101 out:
110 mutex_unlock(&slab_mutex); 102 mutex_unlock(&slab_mutex);
111 return ret; 103 return ret;
112 } 104 }
113 #endif 105 #endif
114 106
115 /* 107 /*
116 * Figure out what the alignment of the objects will be given a set of 108 * Figure out what the alignment of the objects will be given a set of
117 * flags, a user specified alignment and the size of the objects. 109 * flags, a user specified alignment and the size of the objects.
118 */ 110 */
119 unsigned long calculate_alignment(unsigned long flags, 111 unsigned long calculate_alignment(unsigned long flags,
120 unsigned long align, unsigned long size) 112 unsigned long align, unsigned long size)
121 { 113 {
122 /* 114 /*
123 * If the user wants hardware cache aligned objects then follow that 115 * If the user wants hardware cache aligned objects then follow that
124 * suggestion if the object is sufficiently large. 116 * suggestion if the object is sufficiently large.
125 * 117 *
126 * The hardware cache alignment cannot override the specified 118 * The hardware cache alignment cannot override the specified
127 * alignment though. If that is greater, then use it. 119 * alignment though. If that is greater, then use it.
128 */ 120 */
129 if (flags & SLAB_HWCACHE_ALIGN) { 121 if (flags & SLAB_HWCACHE_ALIGN) {
130 unsigned long ralign = cache_line_size(); 122 unsigned long ralign = cache_line_size();
131 while (size <= ralign / 2) 123 while (size <= ralign / 2)
132 ralign /= 2; 124 ralign /= 2;
133 align = max(align, ralign); 125 align = max(align, ralign);
134 } 126 }
135 127
136 if (align < ARCH_SLAB_MINALIGN) 128 if (align < ARCH_SLAB_MINALIGN)
137 align = ARCH_SLAB_MINALIGN; 129 align = ARCH_SLAB_MINALIGN;
138 130
139 return ALIGN(align, sizeof(void *)); 131 return ALIGN(align, sizeof(void *));
140 } 132 }
141 133
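To make the SLAB_HWCACHE_ALIGN branch above concrete, here is a minimal user-space sketch (illustrative only; hwcache_align() is a made-up mirror of the loop, not a kernel API), assuming a 64-byte cache line:

        #include <stdio.h>

        /* Mirrors the SLAB_HWCACHE_ALIGN branch of calculate_alignment() above. */
        static unsigned long hwcache_align(unsigned long align, unsigned long size,
                                           unsigned long cache_line)
        {
                unsigned long ralign = cache_line;

                while (size <= ralign / 2)      /* halve until the object fills more than half a line */
                        ralign /= 2;
                if (ralign > align)
                        align = ralign;
                /* round up to a pointer-size multiple, like ALIGN(align, sizeof(void *)) */
                return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
        }

        int main(void)
        {
                /* a 20-byte object on a 64-byte line: 64 -> 32, stop; alignment is 32 */
                printf("%lu\n", hwcache_align(0, 20, 64));
                return 0;
        }

The effect is that small objects get a fraction of the cache-line size as their alignment rather than being padded out to a full line.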
134 static struct kmem_cache *
135 do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
136 unsigned long flags, void (*ctor)(void *),
137 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
138 {
139 struct kmem_cache *s;
140 int err;
142 141
142 err = -ENOMEM;
143 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
144 if (!s)
145 goto out;
146
147 s->name = name;
148 s->object_size = object_size;
149 s->size = size;
150 s->align = align;
151 s->ctor = ctor;
152
153 err = memcg_alloc_cache_params(memcg, s, root_cache);
154 if (err)
155 goto out_free_cache;
156
157 err = __kmem_cache_create(s, flags);
158 if (err)
159 goto out_free_cache;
160
161 s->refcount = 1;
162 list_add(&s->list, &slab_caches);
163 memcg_register_cache(s);
164 out:
165 if (err)
166 return ERR_PTR(err);
167 return s;
168
169 out_free_cache:
170 memcg_free_cache_params(s);
171 kfree(s);
172 goto out;
173 }
174
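Both creation paths below funnel into this helper. A hedged sketch of the calling convention they rely on (create_named() is a hypothetical caller, and the real callers do this under slab_mutex with the memcg/root_cache arguments filled in where relevant):

        /*
         * Illustrative only: the ownership rule both callers follow.  The name
         * must be dynamically allocated; on success it is owned by the cache,
         * on failure (ERR_PTR) the caller still owns it and must free it.
         */
        static struct kmem_cache *create_named(const char *base, size_t size)
        {
                struct kmem_cache *s;
                char *name = kstrdup(base, GFP_KERNEL);

                if (!name)
                        return ERR_PTR(-ENOMEM);

                s = do_kmem_cache_create(name, size, size,
                                         calculate_alignment(0, 0, size),
                                         0, NULL, NULL, NULL);
                if (IS_ERR(s))
                        kfree(name);
                return s;
        }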
143 /* 175 /*
144 * kmem_cache_create - Create a cache. 176 * kmem_cache_create - Create a cache.
145 * @name: A string which is used in /proc/slabinfo to identify this cache. 177 * @name: A string which is used in /proc/slabinfo to identify this cache.
146 * @size: The size of objects to be created in this cache. 178 * @size: The size of objects to be created in this cache.
147 * @align: The required alignment for the objects. 179 * @align: The required alignment for the objects.
148 * @flags: SLAB flags 180 * @flags: SLAB flags
149 * @ctor: A constructor for the objects. 181 * @ctor: A constructor for the objects.
150 * 182 *
151 * Returns a ptr to the cache on success, NULL on failure. 183 * Returns a ptr to the cache on success, NULL on failure.
152 * Cannot be called within an interrupt, but can be interrupted. 184 * Cannot be called within an interrupt, but can be interrupted.
153 * The @ctor is run when new pages are allocated by the cache. 185 * The @ctor is run when new pages are allocated by the cache.
154 * 186 *
155 * The flags are 187 * The flags are
156 * 188 *
157 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 189 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
158 * to catch references to uninitialised memory. 190 * to catch references to uninitialised memory.
159 * 191 *
160 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 192 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
161 * for buffer overruns. 193 * for buffer overruns.
162 * 194 *
163 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 195 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
164 * cacheline. This can be beneficial if you're counting cycles as closely 196 * cacheline. This can be beneficial if you're counting cycles as closely
165 * as davem. 197 * as davem.
166 */ 198 */
167
168 struct kmem_cache * 199 struct kmem_cache *
169 kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, 200 kmem_cache_create(const char *name, size_t size, size_t align,
170 size_t align, unsigned long flags, void (*ctor)(void *), 201 unsigned long flags, void (*ctor)(void *))
171 struct kmem_cache *parent_cache)
172 { 202 {
173 struct kmem_cache *s = NULL; 203 struct kmem_cache *s;
204 char *cache_name;
174 int err; 205 int err;
175 206
176 get_online_cpus(); 207 get_online_cpus();
177 mutex_lock(&slab_mutex); 208 mutex_lock(&slab_mutex);
178 209
179 err = kmem_cache_sanity_check(memcg, name, size); 210 err = kmem_cache_sanity_check(name, size);
180 if (err) 211 if (err)
181 goto out_unlock; 212 goto out_unlock;
182 213
183 if (memcg) {
184 /*
185 * Since per-memcg caches are created asynchronously on first
186 * allocation (see memcg_kmem_get_cache()), several threads can
187 * try to create the same cache, but only one of them may
188 * succeed. Therefore if we get here and see the cache has
189 * already been created, we silently return NULL.
190 */
191 if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
192 goto out_unlock;
193 }
194
195 /* 214 /*
196 * Some allocators will constrain the set of valid flags to a subset 215 * Some allocators will constrain the set of valid flags to a subset
197 * of all flags. We expect them to define CACHE_CREATE_MASK in this 216 * of all flags. We expect them to define CACHE_CREATE_MASK in this
198 * case, and we'll just provide them with a sanitized version of the 217 * case, and we'll just provide them with a sanitized version of the
199 * passed flags. 218 * passed flags.
200 */ 219 */
201 flags &= CACHE_CREATE_MASK; 220 flags &= CACHE_CREATE_MASK;
202 221
203 if (!memcg) { 222 s = __kmem_cache_alias(name, size, align, flags, ctor);
204 s = __kmem_cache_alias(name, size, align, flags, ctor); 223 if (s)
205 if (s) 224 goto out_unlock;
206 goto out_unlock;
207 }
208 225
209 err = -ENOMEM; 226 cache_name = kstrdup(name, GFP_KERNEL);
210 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 227 if (!cache_name) {
211 if (!s) 228 err = -ENOMEM;
212 goto out_unlock; 229 goto out_unlock;
230 }
213 231
214 s->object_size = s->size = size; 232 s = do_kmem_cache_create(cache_name, size, size,
215 s->align = calculate_alignment(flags, align, size); 233 calculate_alignment(flags, align, size),
216 s->ctor = ctor; 234 flags, ctor, NULL, NULL);
235 if (IS_ERR(s)) {
236 err = PTR_ERR(s);
237 kfree(cache_name);
238 }
217 239
218 if (memcg)
219 s->name = memcg_create_cache_name(memcg, parent_cache);
220 else
221 s->name = kstrdup(name, GFP_KERNEL);
222 if (!s->name)
223 goto out_free_cache;
224
225 err = memcg_alloc_cache_params(memcg, s, parent_cache);
226 if (err)
227 goto out_free_cache;
228
229 err = __kmem_cache_create(s, flags);
230 if (err)
231 goto out_free_cache;
232
233 s->refcount = 1;
234 list_add(&s->list, &slab_caches);
235 memcg_register_cache(s);
236
237 out_unlock: 240 out_unlock:
238 mutex_unlock(&slab_mutex); 241 mutex_unlock(&slab_mutex);
239 put_online_cpus(); 242 put_online_cpus();
240 243
241 if (err) { 244 if (err) {
242 /*
243 * There is no point in flooding logs with warnings or
244 * especially crashing the system if we fail to create a cache
245 * for a memcg. In this case we will be accounting the memcg
246 * allocation to the root cgroup until we succeed to create its
247 * own cache, but it isn't that critical.
248 */
249 if (!memcg)
250 return NULL;
251
252 if (flags & SLAB_PANIC) 245 if (flags & SLAB_PANIC)
253 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", 246 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
254 name, err); 247 name, err);
255 else { 248 else {
256 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", 249 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
257 name, err); 250 name, err);
258 dump_stack(); 251 dump_stack();
259 } 252 }
260 return NULL; 253 return NULL;
261 } 254 }
262 return s; 255 return s;
263
264 out_free_cache:
265 memcg_free_cache_params(s);
266 kfree(s->name);
267 kmem_cache_free(kmem_cache, s);
268 goto out_unlock;
269 } 256 }
257 EXPORT_SYMBOL(kmem_cache_create);
270 258
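For reference, a typical caller of the root-cache path exported above looks roughly like this (struct foo, foo_ctor() and foo_init() are made-up names, shown only as a sketch):

        #include <linux/init.h>
        #include <linux/list.h>
        #include <linux/slab.h>

        struct foo {
                int              id;
                struct list_head node;
        };

        static struct kmem_cache *foo_cachep;

        /* run for each object when the cache allocates new pages */
        static void foo_ctor(void *obj)
        {
                struct foo *f = obj;

                INIT_LIST_HEAD(&f->node);
        }

        static int __init foo_init(void)
        {
                foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
                                               SLAB_HWCACHE_ALIGN, foo_ctor);
                return foo_cachep ? 0 : -ENOMEM;
        }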
271 struct kmem_cache * 259 #ifdef CONFIG_MEMCG_KMEM
272 kmem_cache_create(const char *name, size_t size, size_t align, 260 /*
273 unsigned long flags, void (*ctor)(void *)) 261 * kmem_cache_create_memcg - Create a cache for a memory cgroup.
262 * @memcg: The memory cgroup the new cache is for.
263 * @root_cache: The parent of the new cache.
264 *
265 * This function attempts to create a kmem cache that will serve allocation
266 * requests going from @memcg to @root_cache. The new cache inherits properties
267 * from its parent.
268 */
269 void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache)
274 { 270 {
275 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); 271 struct kmem_cache *s;
272 char *cache_name;
273
274 get_online_cpus();
275 mutex_lock(&slab_mutex);
276
277 /*
278 * Since per-memcg caches are created asynchronously on first
279 * allocation (see memcg_kmem_get_cache()), several threads can try to
280 * create the same cache, but only one of them may succeed.
281 */
282 if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
283 goto out_unlock;
284
285 cache_name = memcg_create_cache_name(memcg, root_cache);
286 if (!cache_name)
287 goto out_unlock;
288
289 s = do_kmem_cache_create(cache_name, root_cache->object_size,
290 root_cache->size, root_cache->align,
291 root_cache->flags, root_cache->ctor,
292 memcg, root_cache);
293 if (IS_ERR(s)) {
294 kfree(cache_name);
295 goto out_unlock;
296 }
297
298 s->allocflags |= __GFP_KMEMCG;
299
300 out_unlock:
301 mutex_unlock(&slab_mutex);
302 put_online_cpus();
276 } 303 }
277 EXPORT_SYMBOL(kmem_cache_create); 304 #endif /* CONFIG_MEMCG_KMEM */
278 305
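The new memcg entry point returns nothing and is idempotent, so the asynchronous creation path mentioned in the comment above can simply call it and forget about the result. A minimal sketch of such a caller (memcg_create_cache_work() is a hypothetical name, not taken from this commit):

        /* Hypothetical worker scheduled by memcg_kmem_get_cache() on a cache miss. */
        static void memcg_create_cache_work(struct mem_cgroup *memcg,
                                            struct kmem_cache *root_cache)
        {
                /*
                 * Safe to race: if another thread created the cache first,
                 * kmem_cache_create_memcg() just bails out.  Failures are also
                 * tolerated; allocations keep being accounted to the root cache
                 * until a later attempt succeeds.
                 */
                kmem_cache_create_memcg(memcg, root_cache);
        }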
279 void kmem_cache_destroy(struct kmem_cache *s) 306 void kmem_cache_destroy(struct kmem_cache *s)
280 { 307 {
281 /* Destroy all the child caches if we aren't a memcg cache */ 308 /* Destroy all the child caches if we aren't a memcg cache */
282 kmem_cache_destroy_memcg_children(s); 309 kmem_cache_destroy_memcg_children(s);
283 310
284 get_online_cpus(); 311 get_online_cpus();
285 mutex_lock(&slab_mutex); 312 mutex_lock(&slab_mutex);
286 s->refcount--; 313 s->refcount--;
287 if (!s->refcount) { 314 if (!s->refcount) {
288 list_del(&s->list); 315 list_del(&s->list);
289 316
290 if (!__kmem_cache_shutdown(s)) { 317 if (!__kmem_cache_shutdown(s)) {
291 memcg_unregister_cache(s); 318 memcg_unregister_cache(s);
292 mutex_unlock(&slab_mutex); 319 mutex_unlock(&slab_mutex);
293 if (s->flags & SLAB_DESTROY_BY_RCU) 320 if (s->flags & SLAB_DESTROY_BY_RCU)
294 rcu_barrier(); 321 rcu_barrier();
295 322
296 memcg_free_cache_params(s); 323 memcg_free_cache_params(s);
297 kfree(s->name); 324 kfree(s->name);
298 kmem_cache_free(kmem_cache, s); 325 kmem_cache_free(kmem_cache, s);
299 } else { 326 } else {
300 list_add(&s->list, &slab_caches); 327 list_add(&s->list, &slab_caches);
301 mutex_unlock(&slab_mutex); 328 mutex_unlock(&slab_mutex);
302 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", 329 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
303 s->name); 330 s->name);
304 dump_stack(); 331 dump_stack();
305 } 332 }
306 } else { 333 } else {
307 mutex_unlock(&slab_mutex); 334 mutex_unlock(&slab_mutex);
308 } 335 }
309 put_online_cpus(); 336 put_online_cpus();
310 } 337 }
311 EXPORT_SYMBOL(kmem_cache_destroy); 338 EXPORT_SYMBOL(kmem_cache_destroy);
312 339
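Continuing the illustrative foo sketch from above, the matching teardown is a single call; as the code above shows, it also takes down any per-memcg child caches first:

        static void __exit foo_exit(void)
        {
                /* destroys per-memcg children via kmem_cache_destroy_memcg_children() */
                kmem_cache_destroy(foo_cachep);
        }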
313 int slab_is_available(void) 340 int slab_is_available(void)
314 { 341 {
315 return slab_state >= UP; 342 return slab_state >= UP;
316 } 343 }
317 344
318 #ifndef CONFIG_SLOB 345 #ifndef CONFIG_SLOB
319 /* Create a cache during boot when no slab services are available yet */ 346 /* Create a cache during boot when no slab services are available yet */
320 void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, 347 void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
321 unsigned long flags) 348 unsigned long flags)
322 { 349 {
323 int err; 350 int err;
324 351
325 s->name = name; 352 s->name = name;
326 s->size = s->object_size = size; 353 s->size = s->object_size = size;
327 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 354 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
328 err = __kmem_cache_create(s, flags); 355 err = __kmem_cache_create(s, flags);
329 356
330 if (err) 357 if (err)
331 panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", 358 panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
332 name, size, err); 359 name, size, err);
333 360
334 s->refcount = -1; /* Exempt from merging for now */ 361 s->refcount = -1; /* Exempt from merging for now */
335 } 362 }
336 363
337 struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, 364 struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
338 unsigned long flags) 365 unsigned long flags)
339 { 366 {
340 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 367 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
341 368
342 if (!s) 369 if (!s)
343 panic("Out of memory when creating slab %s\n", name); 370 panic("Out of memory when creating slab %s\n", name);
344 371
345 create_boot_cache(s, name, size, flags); 372 create_boot_cache(s, name, size, flags);
346 list_add(&s->list, &slab_caches); 373 list_add(&s->list, &slab_caches);
347 s->refcount = 1; 374 s->refcount = 1;
348 return s; 375 return s;
349 } 376 }
350 377
351 struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 378 struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
352 EXPORT_SYMBOL(kmalloc_caches); 379 EXPORT_SYMBOL(kmalloc_caches);
353 380
354 #ifdef CONFIG_ZONE_DMA 381 #ifdef CONFIG_ZONE_DMA
355 struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 382 struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
356 EXPORT_SYMBOL(kmalloc_dma_caches); 383 EXPORT_SYMBOL(kmalloc_dma_caches);
357 #endif 384 #endif
358 385
359 /* 386 /*
360 * Conversion table for small slabs sizes / 8 to the index in the 387 * Conversion table for small slabs sizes / 8 to the index in the
361 * kmalloc array. This is necessary for slabs < 192 since we have non power 388 * kmalloc array. This is necessary for slabs < 192 since we have non power
362 * of two cache sizes there. The size of larger slabs can be determined using 389 * of two cache sizes there. The size of larger slabs can be determined using
363 * fls. 390 * fls.
364 */ 391 */
365 static s8 size_index[24] = { 392 static s8 size_index[24] = {
366 3, /* 8 */ 393 3, /* 8 */
367 4, /* 16 */ 394 4, /* 16 */
368 5, /* 24 */ 395 5, /* 24 */
369 5, /* 32 */ 396 5, /* 32 */
370 6, /* 40 */ 397 6, /* 40 */
371 6, /* 48 */ 398 6, /* 48 */
372 6, /* 56 */ 399 6, /* 56 */
373 6, /* 64 */ 400 6, /* 64 */
374 1, /* 72 */ 401 1, /* 72 */
375 1, /* 80 */ 402 1, /* 80 */
376 1, /* 88 */ 403 1, /* 88 */
377 1, /* 96 */ 404 1, /* 96 */
378 7, /* 104 */ 405 7, /* 104 */
379 7, /* 112 */ 406 7, /* 112 */
380 7, /* 120 */ 407 7, /* 120 */
381 7, /* 128 */ 408 7, /* 128 */
382 2, /* 136 */ 409 2, /* 136 */
383 2, /* 144 */ 410 2, /* 144 */
384 2, /* 152 */ 411 2, /* 152 */
385 2, /* 160 */ 412 2, /* 160 */
386 2, /* 168 */ 413 2, /* 168 */
387 2, /* 176 */ 414 2, /* 176 */
388 2, /* 184 */ 415 2, /* 184 */
389 2 /* 192 */ 416 2 /* 192 */
390 }; 417 };
391 418
392 static inline int size_index_elem(size_t bytes) 419 static inline int size_index_elem(size_t bytes)
393 { 420 {
394 return (bytes - 1) / 8; 421 return (bytes - 1) / 8;
395 } 422 }
396 423
397 /* 424 /*
398 * Find the kmem_cache structure that serves a given size of 425 * Find the kmem_cache structure that serves a given size of
399 * allocation 426 * allocation
400 */ 427 */
401 struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) 428 struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
402 { 429 {
403 int index; 430 int index;
404 431
405 if (unlikely(size > KMALLOC_MAX_SIZE)) { 432 if (unlikely(size > KMALLOC_MAX_SIZE)) {
406 WARN_ON_ONCE(!(flags & __GFP_NOWARN)); 433 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
407 return NULL; 434 return NULL;
408 } 435 }
409 436
410 if (size <= 192) { 437 if (size <= 192) {
411 if (!size) 438 if (!size)
412 return ZERO_SIZE_PTR; 439 return ZERO_SIZE_PTR;
413 440
414 index = size_index[size_index_elem(size)]; 441 index = size_index[size_index_elem(size)];
415 } else 442 } else
416 index = fls(size - 1); 443 index = fls(size - 1);
417 444
418 #ifdef CONFIG_ZONE_DMA 445 #ifdef CONFIG_ZONE_DMA
419 if (unlikely((flags & GFP_DMA))) 446 if (unlikely((flags & GFP_DMA)))
420 return kmalloc_dma_caches[index]; 447 return kmalloc_dma_caches[index];
421 448
422 #endif 449 #endif
423 return kmalloc_caches[index]; 450 return kmalloc_caches[index];
424 } 451 }
425 452
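A worked example of the lookup above (illustrative): a 100-byte request gives size_index[(100 - 1) / 8] = size_index[12] = 7, i.e. the 128-byte kmalloc cache, while a 1000-byte request gives fls(999) = 10, i.e. the 1024-byte cache. A small user-space mirror of the index calculation (kmalloc_index_of() is a made-up helper):

        #include <stdio.h>

        /* Mirrors kmalloc_slab()'s index lookup; ignores the zero-size and DMA cases. */
        static int kmalloc_index_of(unsigned long size)
        {
                static const signed char size_index[24] = {
                        3, 4, 5, 5, 6, 6, 6, 6, 1, 1, 1, 1,
                        7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2,
                };

                if (size <= 192)
                        return size_index[(size - 1) / 8];
                return 64 - __builtin_clzll(size - 1);  /* equivalent to fls(size - 1) here */
        }

        int main(void)
        {
                /* prints "7 10": the 128-byte and 1024-byte kmalloc caches */
                printf("%d %d\n", kmalloc_index_of(100), kmalloc_index_of(1000));
                return 0;
        }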
426 /* 453 /*
427 * Create the kmalloc array. Some of the regular kmalloc arrays 454 * Create the kmalloc array. Some of the regular kmalloc arrays
428 * may already have been created because they were needed to 455 * may already have been created because they were needed to
429 * enable allocations for slab creation. 456 * enable allocations for slab creation.
430 */ 457 */
431 void __init create_kmalloc_caches(unsigned long flags) 458 void __init create_kmalloc_caches(unsigned long flags)
432 { 459 {
433 int i; 460 int i;
434 461
435 /* 462 /*
436 * Patch up the size_index table if we have strange large alignment 463 * Patch up the size_index table if we have strange large alignment
437 * requirements for the kmalloc array. This is only the case for 464 * requirements for the kmalloc array. This is only the case for
438 * MIPS it seems. The standard arches will not generate any code here. 465 * MIPS it seems. The standard arches will not generate any code here.
439 * 466 *
440 * Largest permitted alignment is 256 bytes due to the way we 467 * Largest permitted alignment is 256 bytes due to the way we
441 * handle the index determination for the smaller caches. 468 * handle the index determination for the smaller caches.
442 * 469 *
443 * Make sure that nothing crazy happens if someone starts tinkering 470 * Make sure that nothing crazy happens if someone starts tinkering
444 * around with ARCH_KMALLOC_MINALIGN 471 * around with ARCH_KMALLOC_MINALIGN
445 */ 472 */
446 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 473 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
447 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 474 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
448 475
449 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 476 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
450 int elem = size_index_elem(i); 477 int elem = size_index_elem(i);
451 478
452 if (elem >= ARRAY_SIZE(size_index)) 479 if (elem >= ARRAY_SIZE(size_index))
453 break; 480 break;
454 size_index[elem] = KMALLOC_SHIFT_LOW; 481 size_index[elem] = KMALLOC_SHIFT_LOW;
455 } 482 }
456 483
457 if (KMALLOC_MIN_SIZE >= 64) { 484 if (KMALLOC_MIN_SIZE >= 64) {
458 /* 485 /*
459 * The 96 byte size cache is not used if the alignment 486 * The 96 byte size cache is not used if the alignment
461 * is 64 bytes. 488 * is 64 bytes.
461 */ 488 */
462 for (i = 64 + 8; i <= 96; i += 8) 489 for (i = 64 + 8; i <= 96; i += 8)
463 size_index[size_index_elem(i)] = 7; 490 size_index[size_index_elem(i)] = 7;
464 491
465 } 492 }
466 493
467 if (KMALLOC_MIN_SIZE >= 128) { 494 if (KMALLOC_MIN_SIZE >= 128) {
468 /* 495 /*
469 * The 192 byte sized cache is not used if the alignment 496 * The 192 byte sized cache is not used if the alignment
470 * is 128 bytes. Redirect kmalloc to use the 256 byte cache 497 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
471 * instead. 498 * instead.
472 */ 499 */
473 for (i = 128 + 8; i <= 192; i += 8) 500 for (i = 128 + 8; i <= 192; i += 8)
474 size_index[size_index_elem(i)] = 8; 501 size_index[size_index_elem(i)] = 8;
475 } 502 }
476 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 503 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
477 if (!kmalloc_caches[i]) { 504 if (!kmalloc_caches[i]) {
478 kmalloc_caches[i] = create_kmalloc_cache(NULL, 505 kmalloc_caches[i] = create_kmalloc_cache(NULL,
479 1 << i, flags); 506 1 << i, flags);
480 } 507 }
481 508
482 /* 509 /*
483 * Caches that are not of a power-of-two size. 510 * Caches that are not of a power-of-two size.
484 * These have to be created immediately after the 511 * These have to be created immediately after the
485 * earlier power-of-two caches 512 * earlier power-of-two caches
486 */ 513 */
487 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) 514 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
488 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); 515 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
489 516
490 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) 517 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
491 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); 518 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
492 } 519 }
493 520
494 /* Kmalloc array is now usable */ 521 /* Kmalloc array is now usable */
495 slab_state = UP; 522 slab_state = UP;
496 523
497 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 524 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
498 struct kmem_cache *s = kmalloc_caches[i]; 525 struct kmem_cache *s = kmalloc_caches[i];
499 char *n; 526 char *n;
500 527
501 if (s) { 528 if (s) {
502 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); 529 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
503 530
504 BUG_ON(!n); 531 BUG_ON(!n);
505 s->name = n; 532 s->name = n;
506 } 533 }
507 } 534 }
508 535
509 #ifdef CONFIG_ZONE_DMA 536 #ifdef CONFIG_ZONE_DMA
510 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 537 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
511 struct kmem_cache *s = kmalloc_caches[i]; 538 struct kmem_cache *s = kmalloc_caches[i];
512 539
513 if (s) { 540 if (s) {
514 int size = kmalloc_size(i); 541 int size = kmalloc_size(i);
515 char *n = kasprintf(GFP_NOWAIT, 542 char *n = kasprintf(GFP_NOWAIT,
516 "dma-kmalloc-%d", size); 543 "dma-kmalloc-%d", size);
517 544
518 BUG_ON(!n); 545 BUG_ON(!n);
519 kmalloc_dma_caches[i] = create_kmalloc_cache(n, 546 kmalloc_dma_caches[i] = create_kmalloc_cache(n,
520 size, SLAB_CACHE_DMA | flags); 547 size, SLAB_CACHE_DMA | flags);
521 } 548 }
522 } 549 }
523 #endif 550 #endif
524 } 551 }
525 #endif /* !CONFIG_SLOB */ 552 #endif /* !CONFIG_SLOB */
526 553
527 #ifdef CONFIG_TRACING 554 #ifdef CONFIG_TRACING
528 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 555 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
529 { 556 {
530 void *ret = kmalloc_order(size, flags, order); 557 void *ret = kmalloc_order(size, flags, order);
531 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); 558 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
532 return ret; 559 return ret;
533 } 560 }
534 EXPORT_SYMBOL(kmalloc_order_trace); 561 EXPORT_SYMBOL(kmalloc_order_trace);
535 #endif 562 #endif
536 563
537 #ifdef CONFIG_SLABINFO 564 #ifdef CONFIG_SLABINFO
538 565
539 #ifdef CONFIG_SLAB 566 #ifdef CONFIG_SLAB
540 #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) 567 #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
541 #else 568 #else
542 #define SLABINFO_RIGHTS S_IRUSR 569 #define SLABINFO_RIGHTS S_IRUSR
543 #endif 570 #endif
544 571
545 void print_slabinfo_header(struct seq_file *m) 572 void print_slabinfo_header(struct seq_file *m)
546 { 573 {
547 /* 574 /*
548 * Output format version, so at least we can change it 575 * Output format version, so at least we can change it
549 * without _too_ many complaints. 576 * without _too_ many complaints.
550 */ 577 */
551 #ifdef CONFIG_DEBUG_SLAB 578 #ifdef CONFIG_DEBUG_SLAB
552 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 579 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
553 #else 580 #else
554 seq_puts(m, "slabinfo - version: 2.1\n"); 581 seq_puts(m, "slabinfo - version: 2.1\n");
555 #endif 582 #endif
556 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 583 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
557 "<objperslab> <pagesperslab>"); 584 "<objperslab> <pagesperslab>");
558 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 585 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
559 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 586 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
560 #ifdef CONFIG_DEBUG_SLAB 587 #ifdef CONFIG_DEBUG_SLAB
561 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 588 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
562 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 589 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
563 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 590 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
564 #endif 591 #endif
565 seq_putc(m, '\n'); 592 seq_putc(m, '\n');
566 } 593 }
567 594
568 static void *s_start(struct seq_file *m, loff_t *pos) 595 static void *s_start(struct seq_file *m, loff_t *pos)
569 { 596 {
570 loff_t n = *pos; 597 loff_t n = *pos;
571 598
572 mutex_lock(&slab_mutex); 599 mutex_lock(&slab_mutex);
573 if (!n) 600 if (!n)
574 print_slabinfo_header(m); 601 print_slabinfo_header(m);
575 602
576 return seq_list_start(&slab_caches, *pos); 603 return seq_list_start(&slab_caches, *pos);
577 } 604 }
578 605
579 void *slab_next(struct seq_file *m, void *p, loff_t *pos) 606 void *slab_next(struct seq_file *m, void *p, loff_t *pos)
580 { 607 {
581 return seq_list_next(p, &slab_caches, pos); 608 return seq_list_next(p, &slab_caches, pos);
582 } 609 }
583 610
584 void slab_stop(struct seq_file *m, void *p) 611 void slab_stop(struct seq_file *m, void *p)
585 { 612 {
586 mutex_unlock(&slab_mutex); 613 mutex_unlock(&slab_mutex);
587 } 614 }
588 615
589 static void 616 static void
590 memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) 617 memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
591 { 618 {
592 struct kmem_cache *c; 619 struct kmem_cache *c;
593 struct slabinfo sinfo; 620 struct slabinfo sinfo;
594 int i; 621 int i;
595 622
596 if (!is_root_cache(s)) 623 if (!is_root_cache(s))
597 return; 624 return;
598 625
599 for_each_memcg_cache_index(i) { 626 for_each_memcg_cache_index(i) {
600 c = cache_from_memcg_idx(s, i); 627 c = cache_from_memcg_idx(s, i);
601 if (!c) 628 if (!c)
602 continue; 629 continue;
603 630
604 memset(&sinfo, 0, sizeof(sinfo)); 631 memset(&sinfo, 0, sizeof(sinfo));
605 get_slabinfo(c, &sinfo); 632 get_slabinfo(c, &sinfo);
606 633
607 info->active_slabs += sinfo.active_slabs; 634 info->active_slabs += sinfo.active_slabs;
608 info->num_slabs += sinfo.num_slabs; 635 info->num_slabs += sinfo.num_slabs;
609 info->shared_avail += sinfo.shared_avail; 636 info->shared_avail += sinfo.shared_avail;
610 info->active_objs += sinfo.active_objs; 637 info->active_objs += sinfo.active_objs;
611 info->num_objs += sinfo.num_objs; 638 info->num_objs += sinfo.num_objs;
612 } 639 }
613 } 640 }
614 641
615 int cache_show(struct kmem_cache *s, struct seq_file *m) 642 int cache_show(struct kmem_cache *s, struct seq_file *m)
616 { 643 {
617 struct slabinfo sinfo; 644 struct slabinfo sinfo;
618 645
619 memset(&sinfo, 0, sizeof(sinfo)); 646 memset(&sinfo, 0, sizeof(sinfo));
620 get_slabinfo(s, &sinfo); 647 get_slabinfo(s, &sinfo);
621 648
622 memcg_accumulate_slabinfo(s, &sinfo); 649 memcg_accumulate_slabinfo(s, &sinfo);
623 650
624 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 651 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
625 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, 652 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
626 sinfo.objects_per_slab, (1 << sinfo.cache_order)); 653 sinfo.objects_per_slab, (1 << sinfo.cache_order));
627 654
628 seq_printf(m, " : tunables %4u %4u %4u", 655 seq_printf(m, " : tunables %4u %4u %4u",