Commit e8ea14cc6eadfe2ea63e9989e16e62625a2619f8

Authored by Johannes Weiner
Committed by Linus Torvalds
1 parent 5ac8fb31ad

mm: memcontrol: take a css reference for each charged page

Charges currently pin the css indirectly by playing tricks during
css_offline(): user pages stall the offlining process until all of them
have been reparented, whereas kmemcg acquires a keep-alive reference if
outstanding kernel pages are detected at that point.

In preparation for removing all this complexity, make the pinning explicit
and acquire a css reference for every charged page.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
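
As an illustration of the explicit pinning described above, the sketch below
shows how a memcg charge/uncharge path can take one css reference per charged
page using the css_get_many()/css_put_many() helpers this patch adds to
include/linux/cgroup.h. It is only a minimal sketch, assuming the usual
memcontrol context (a struct mem_cgroup with an embedded css); the function
names my_charge_pages()/my_uncharge_pages() are hypothetical and this is not
the actual mm/memcontrol.c hunk, which is not shown in this view.

	/*
	 * Illustrative sketch only: pin the memcg's css once for every page
	 * charged against it, and drop those references again on uncharge.
	 * css_get_many()/css_put_many() are the helpers introduced below;
	 * my_charge_pages()/my_uncharge_pages() are hypothetical callers.
	 */
	static void my_charge_pages(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* ... charge nr_pages to the memcg's page counters ... */
		css_get_many(&memcg->css, nr_pages);	/* one css ref per page */
	}

	static void my_uncharge_pages(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* ... uncharge nr_pages from the page counters ... */
		css_put_many(&memcg->css, nr_pages);	/* drop the per-page refs */
	}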

Showing 3 changed files with 81 additions and 13 deletions

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/rculist.h> 15 #include <linux/rculist.h>
16 #include <linux/cgroupstats.h> 16 #include <linux/cgroupstats.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/workqueue.h> 19 #include <linux/workqueue.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/percpu-refcount.h> 21 #include <linux/percpu-refcount.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/kernfs.h> 23 #include <linux/kernfs.h>
24 #include <linux/wait.h> 24 #include <linux/wait.h>
25 25
26 #ifdef CONFIG_CGROUPS 26 #ifdef CONFIG_CGROUPS
27 27
28 struct cgroup_root; 28 struct cgroup_root;
29 struct cgroup_subsys; 29 struct cgroup_subsys;
30 struct cgroup; 30 struct cgroup;
31 31
32 extern int cgroup_init_early(void); 32 extern int cgroup_init_early(void);
33 extern int cgroup_init(void); 33 extern int cgroup_init(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_post_fork(struct task_struct *p); 35 extern void cgroup_post_fork(struct task_struct *p);
36 extern void cgroup_exit(struct task_struct *p); 36 extern void cgroup_exit(struct task_struct *p);
37 extern int cgroupstats_build(struct cgroupstats *stats, 37 extern int cgroupstats_build(struct cgroupstats *stats,
38 struct dentry *dentry); 38 struct dentry *dentry);
39 39
40 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, 40 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
41 struct pid *pid, struct task_struct *tsk); 41 struct pid *pid, struct task_struct *tsk);
42 42
43 /* define the enumeration of all cgroup subsystems */ 43 /* define the enumeration of all cgroup subsystems */
44 #define SUBSYS(_x) _x ## _cgrp_id, 44 #define SUBSYS(_x) _x ## _cgrp_id,
45 enum cgroup_subsys_id { 45 enum cgroup_subsys_id {
46 #include <linux/cgroup_subsys.h> 46 #include <linux/cgroup_subsys.h>
47 CGROUP_SUBSYS_COUNT, 47 CGROUP_SUBSYS_COUNT,
48 }; 48 };
49 #undef SUBSYS 49 #undef SUBSYS
50 50
51 /* 51 /*
52 * Per-subsystem/per-cgroup state maintained by the system. This is the 52 * Per-subsystem/per-cgroup state maintained by the system. This is the
53 * fundamental structural building block that controllers deal with. 53 * fundamental structural building block that controllers deal with.
54 * 54 *
55 * Fields marked with "PI:" are public and immutable and may be accessed 55 * Fields marked with "PI:" are public and immutable and may be accessed
56 * directly without synchronization. 56 * directly without synchronization.
57 */ 57 */
58 struct cgroup_subsys_state { 58 struct cgroup_subsys_state {
59 /* PI: the cgroup that this css is attached to */ 59 /* PI: the cgroup that this css is attached to */
60 struct cgroup *cgroup; 60 struct cgroup *cgroup;
61 61
62 /* PI: the cgroup subsystem that this css is attached to */ 62 /* PI: the cgroup subsystem that this css is attached to */
63 struct cgroup_subsys *ss; 63 struct cgroup_subsys *ss;
64 64
65 /* reference count - access via css_[try]get() and css_put() */ 65 /* reference count - access via css_[try]get() and css_put() */
66 struct percpu_ref refcnt; 66 struct percpu_ref refcnt;
67 67
68 /* PI: the parent css */ 68 /* PI: the parent css */
69 struct cgroup_subsys_state *parent; 69 struct cgroup_subsys_state *parent;
70 70
71 /* siblings list anchored at the parent's ->children */ 71 /* siblings list anchored at the parent's ->children */
72 struct list_head sibling; 72 struct list_head sibling;
73 struct list_head children; 73 struct list_head children;
74 74
75 /* 75 /*
76 * PI: Subsys-unique ID. 0 is unused and root is always 1. The 76 * PI: Subsys-unique ID. 0 is unused and root is always 1. The
77 * matching css can be looked up using css_from_id(). 77 * matching css can be looked up using css_from_id().
78 */ 78 */
79 int id; 79 int id;
80 80
81 unsigned int flags; 81 unsigned int flags;
82 82
83 /* 83 /*
84 * Monotonically increasing unique serial number which defines a 84 * Monotonically increasing unique serial number which defines a
85 * uniform order among all csses. It's guaranteed that all 85 * uniform order among all csses. It's guaranteed that all
86 * ->children lists are in the ascending order of ->serial_nr and 86 * ->children lists are in the ascending order of ->serial_nr and
87 * used to allow interrupting and resuming iterations. 87 * used to allow interrupting and resuming iterations.
88 */ 88 */
89 u64 serial_nr; 89 u64 serial_nr;
90 90
91 /* percpu_ref killing and RCU release */ 91 /* percpu_ref killing and RCU release */
92 struct rcu_head rcu_head; 92 struct rcu_head rcu_head;
93 struct work_struct destroy_work; 93 struct work_struct destroy_work;
94 }; 94 };
95 95
96 /* bits in struct cgroup_subsys_state flags field */ 96 /* bits in struct cgroup_subsys_state flags field */
97 enum { 97 enum {
98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */ 98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ 100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
101 }; 101 };
102 102
103 /** 103 /**
104 * css_get - obtain a reference on the specified css 104 * css_get - obtain a reference on the specified css
105 * @css: target css 105 * @css: target css
106 * 106 *
107 * The caller must already have a reference. 107 * The caller must already have a reference.
108 */ 108 */
109 static inline void css_get(struct cgroup_subsys_state *css) 109 static inline void css_get(struct cgroup_subsys_state *css)
110 { 110 {
111 if (!(css->flags & CSS_NO_REF)) 111 if (!(css->flags & CSS_NO_REF))
112 percpu_ref_get(&css->refcnt); 112 percpu_ref_get(&css->refcnt);
113 } 113 }
114 114
115 /** 115 /**
116 * css_get_many - obtain references on the specified css
117 * @css: target css
118 * @n: number of references to get
119 *
120 * The caller must already have a reference.
121 */
122 static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
123 {
124 if (!(css->flags & CSS_NO_REF))
125 percpu_ref_get_many(&css->refcnt, n);
126 }
127
128 /**
116 * css_tryget - try to obtain a reference on the specified css 129 * css_tryget - try to obtain a reference on the specified css
117 * @css: target css 130 * @css: target css
118 * 131 *
119 * Obtain a reference on @css unless it already has reached zero and is 132 * Obtain a reference on @css unless it already has reached zero and is
120 * being released. This function doesn't care whether @css is on or 133 * being released. This function doesn't care whether @css is on or
121 * offline. The caller naturally needs to ensure that @css is accessible 134 * offline. The caller naturally needs to ensure that @css is accessible
122 * but doesn't have to be holding a reference on it - IOW, RCU protected 135 * but doesn't have to be holding a reference on it - IOW, RCU protected
123 * access is good enough for this function. Returns %true if a reference 136 * access is good enough for this function. Returns %true if a reference
124 * count was successfully obtained; %false otherwise. 137 * count was successfully obtained; %false otherwise.
125 */ 138 */
126 static inline bool css_tryget(struct cgroup_subsys_state *css) 139 static inline bool css_tryget(struct cgroup_subsys_state *css)
127 { 140 {
128 if (!(css->flags & CSS_NO_REF)) 141 if (!(css->flags & CSS_NO_REF))
129 return percpu_ref_tryget(&css->refcnt); 142 return percpu_ref_tryget(&css->refcnt);
130 return true; 143 return true;
131 } 144 }
132 145
133 /** 146 /**
134 * css_tryget_online - try to obtain a reference on the specified css if online 147 * css_tryget_online - try to obtain a reference on the specified css if online
135 * @css: target css 148 * @css: target css
136 * 149 *
137 * Obtain a reference on @css if it's online. The caller naturally needs 150 * Obtain a reference on @css if it's online. The caller naturally needs
138 * to ensure that @css is accessible but doesn't have to be holding a 151 * to ensure that @css is accessible but doesn't have to be holding a
139 * reference on it - IOW, RCU protected access is good enough for this 152 * reference on it - IOW, RCU protected access is good enough for this
140 * function. Returns %true if a reference count was successfully obtained; 153 * function. Returns %true if a reference count was successfully obtained;
141 * %false otherwise. 154 * %false otherwise.
142 */ 155 */
143 static inline bool css_tryget_online(struct cgroup_subsys_state *css) 156 static inline bool css_tryget_online(struct cgroup_subsys_state *css)
144 { 157 {
145 if (!(css->flags & CSS_NO_REF)) 158 if (!(css->flags & CSS_NO_REF))
146 return percpu_ref_tryget_live(&css->refcnt); 159 return percpu_ref_tryget_live(&css->refcnt);
147 return true; 160 return true;
148 } 161 }
149 162
150 /** 163 /**
151 * css_put - put a css reference 164 * css_put - put a css reference
152 * @css: target css 165 * @css: target css
153 * 166 *
154 * Put a reference obtained via css_get() and css_tryget_online(). 167 * Put a reference obtained via css_get() and css_tryget_online().
155 */ 168 */
156 static inline void css_put(struct cgroup_subsys_state *css) 169 static inline void css_put(struct cgroup_subsys_state *css)
157 { 170 {
158 if (!(css->flags & CSS_NO_REF)) 171 if (!(css->flags & CSS_NO_REF))
159 percpu_ref_put(&css->refcnt); 172 percpu_ref_put(&css->refcnt);
173 }
174
175 /**
176 * css_put_many - put css references
177 * @css: target css
178 * @n: number of references to put
179 *
180 * Put references obtained via css_get() and css_tryget_online().
181 */
182 static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
183 {
184 if (!(css->flags & CSS_NO_REF))
185 percpu_ref_put_many(&css->refcnt, n);
160 } 186 }
161 187
162 /* bits in struct cgroup flags field */ 188 /* bits in struct cgroup flags field */
163 enum { 189 enum {
164 /* Control Group requires release notifications to userspace */ 190 /* Control Group requires release notifications to userspace */
165 CGRP_NOTIFY_ON_RELEASE, 191 CGRP_NOTIFY_ON_RELEASE,
166 /* 192 /*
167 * Clone the parent's configuration when creating a new child 193 * Clone the parent's configuration when creating a new child
168 * cpuset cgroup. For historical reasons, this option can be 194 * cpuset cgroup. For historical reasons, this option can be
169 * specified at mount time and thus is implemented here. 195 * specified at mount time and thus is implemented here.
170 */ 196 */
171 CGRP_CPUSET_CLONE_CHILDREN, 197 CGRP_CPUSET_CLONE_CHILDREN,
172 }; 198 };
173 199
174 struct cgroup { 200 struct cgroup {
175 /* self css with NULL ->ss, points back to this cgroup */ 201 /* self css with NULL ->ss, points back to this cgroup */
176 struct cgroup_subsys_state self; 202 struct cgroup_subsys_state self;
177 203
178 unsigned long flags; /* "unsigned long" so bitops work */ 204 unsigned long flags; /* "unsigned long" so bitops work */
179 205
180 /* 206 /*
181 * idr allocated in-hierarchy ID. 207 * idr allocated in-hierarchy ID.
182 * 208 *
183 * ID 0 is not used, the ID of the root cgroup is always 1, and a 209 * ID 0 is not used, the ID of the root cgroup is always 1, and a
184 * new cgroup will be assigned with a smallest available ID. 210 * new cgroup will be assigned with a smallest available ID.
185 * 211 *
186 * Allocating/Removing ID must be protected by cgroup_mutex. 212 * Allocating/Removing ID must be protected by cgroup_mutex.
187 */ 213 */
188 int id; 214 int id;
189 215
190 /* 216 /*
191 * If this cgroup contains any tasks, it contributes one to 217 * If this cgroup contains any tasks, it contributes one to
192 * populated_cnt. All children with non-zero popuplated_cnt of 218 * populated_cnt. All children with non-zero popuplated_cnt of
193 * their own contribute one. The count is zero iff there's no task 219 * their own contribute one. The count is zero iff there's no task
194 * in this cgroup or its subtree. 220 * in this cgroup or its subtree.
195 */ 221 */
196 int populated_cnt; 222 int populated_cnt;
197 223
198 struct kernfs_node *kn; /* cgroup kernfs entry */ 224 struct kernfs_node *kn; /* cgroup kernfs entry */
199 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ 225 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
200 226
201 /* 227 /*
202 * The bitmask of subsystems enabled on the child cgroups. 228 * The bitmask of subsystems enabled on the child cgroups.
203 * ->subtree_control is the one configured through 229 * ->subtree_control is the one configured through
204 * "cgroup.subtree_control" while ->child_subsys_mask is the 230 * "cgroup.subtree_control" while ->child_subsys_mask is the
205 * effective one which may have more subsystems enabled. 231 * effective one which may have more subsystems enabled.
206 * Controller knobs are made available iff it's enabled in 232 * Controller knobs are made available iff it's enabled in
207 * ->subtree_control. 233 * ->subtree_control.
208 */ 234 */
209 unsigned int subtree_control; 235 unsigned int subtree_control;
210 unsigned int child_subsys_mask; 236 unsigned int child_subsys_mask;
211 237
212 /* Private pointers for each registered subsystem */ 238 /* Private pointers for each registered subsystem */
213 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 239 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
214 240
215 struct cgroup_root *root; 241 struct cgroup_root *root;
216 242
217 /* 243 /*
218 * List of cgrp_cset_links pointing at css_sets with tasks in this 244 * List of cgrp_cset_links pointing at css_sets with tasks in this
219 * cgroup. Protected by css_set_lock. 245 * cgroup. Protected by css_set_lock.
220 */ 246 */
221 struct list_head cset_links; 247 struct list_head cset_links;
222 248
223 /* 249 /*
224 * On the default hierarchy, a css_set for a cgroup with some 250 * On the default hierarchy, a css_set for a cgroup with some
225 * susbsys disabled will point to css's which are associated with 251 * susbsys disabled will point to css's which are associated with
226 * the closest ancestor which has the subsys enabled. The 252 * the closest ancestor which has the subsys enabled. The
227 * following lists all css_sets which point to this cgroup's css 253 * following lists all css_sets which point to this cgroup's css
228 * for the given subsystem. 254 * for the given subsystem.
229 */ 255 */
230 struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 256 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
231 257
232 /* 258 /*
233 * list of pidlists, up to two for each namespace (one for procs, one 259 * list of pidlists, up to two for each namespace (one for procs, one
234 * for tasks); created on demand. 260 * for tasks); created on demand.
235 */ 261 */
236 struct list_head pidlists; 262 struct list_head pidlists;
237 struct mutex pidlist_mutex; 263 struct mutex pidlist_mutex;
238 264
239 /* used to wait for offlining of csses */ 265 /* used to wait for offlining of csses */
240 wait_queue_head_t offline_waitq; 266 wait_queue_head_t offline_waitq;
241 267
242 /* used to schedule release agent */ 268 /* used to schedule release agent */
243 struct work_struct release_agent_work; 269 struct work_struct release_agent_work;
244 }; 270 };
245 271
246 #define MAX_CGROUP_ROOT_NAMELEN 64 272 #define MAX_CGROUP_ROOT_NAMELEN 64
247 273
248 /* cgroup_root->flags */ 274 /* cgroup_root->flags */
249 enum { 275 enum {
250 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ 276 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
251 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 277 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
252 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 278 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
253 }; 279 };
254 280
255 /* 281 /*
256 * A cgroup_root represents the root of a cgroup hierarchy, and may be 282 * A cgroup_root represents the root of a cgroup hierarchy, and may be
257 * associated with a kernfs_root to form an active hierarchy. This is 283 * associated with a kernfs_root to form an active hierarchy. This is
258 * internal to cgroup core. Don't access directly from controllers. 284 * internal to cgroup core. Don't access directly from controllers.
259 */ 285 */
260 struct cgroup_root { 286 struct cgroup_root {
261 struct kernfs_root *kf_root; 287 struct kernfs_root *kf_root;
262 288
263 /* The bitmask of subsystems attached to this hierarchy */ 289 /* The bitmask of subsystems attached to this hierarchy */
264 unsigned int subsys_mask; 290 unsigned int subsys_mask;
265 291
266 /* Unique id for this hierarchy. */ 292 /* Unique id for this hierarchy. */
267 int hierarchy_id; 293 int hierarchy_id;
268 294
269 /* The root cgroup. Root is destroyed on its release. */ 295 /* The root cgroup. Root is destroyed on its release. */
270 struct cgroup cgrp; 296 struct cgroup cgrp;
271 297
272 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ 298 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
273 atomic_t nr_cgrps; 299 atomic_t nr_cgrps;
274 300
275 /* A list running through the active hierarchies */ 301 /* A list running through the active hierarchies */
276 struct list_head root_list; 302 struct list_head root_list;
277 303
278 /* Hierarchy-specific flags */ 304 /* Hierarchy-specific flags */
279 unsigned int flags; 305 unsigned int flags;
280 306
281 /* IDs for cgroups in this hierarchy */ 307 /* IDs for cgroups in this hierarchy */
282 struct idr cgroup_idr; 308 struct idr cgroup_idr;
283 309
284 /* The path to use for release notifications. */ 310 /* The path to use for release notifications. */
285 char release_agent_path[PATH_MAX]; 311 char release_agent_path[PATH_MAX];
286 312
287 /* The name for this hierarchy - may be empty */ 313 /* The name for this hierarchy - may be empty */
288 char name[MAX_CGROUP_ROOT_NAMELEN]; 314 char name[MAX_CGROUP_ROOT_NAMELEN];
289 }; 315 };
290 316
291 /* 317 /*
292 * A css_set is a structure holding pointers to a set of 318 * A css_set is a structure holding pointers to a set of
293 * cgroup_subsys_state objects. This saves space in the task struct 319 * cgroup_subsys_state objects. This saves space in the task struct
294 * object and speeds up fork()/exit(), since a single inc/dec and a 320 * object and speeds up fork()/exit(), since a single inc/dec and a
295 * list_add()/del() can bump the reference count on the entire cgroup 321 * list_add()/del() can bump the reference count on the entire cgroup
296 * set for a task. 322 * set for a task.
297 */ 323 */
298 324
299 struct css_set { 325 struct css_set {
300 326
301 /* Reference count */ 327 /* Reference count */
302 atomic_t refcount; 328 atomic_t refcount;
303 329
304 /* 330 /*
305 * List running through all cgroup groups in the same hash 331 * List running through all cgroup groups in the same hash
306 * slot. Protected by css_set_lock 332 * slot. Protected by css_set_lock
307 */ 333 */
308 struct hlist_node hlist; 334 struct hlist_node hlist;
309 335
310 /* 336 /*
311 * Lists running through all tasks using this cgroup group. 337 * Lists running through all tasks using this cgroup group.
312 * mg_tasks lists tasks which belong to this cset but are in the 338 * mg_tasks lists tasks which belong to this cset but are in the
313 * process of being migrated out or in. Protected by 339 * process of being migrated out or in. Protected by
314 * css_set_rwsem, but, during migration, once tasks are moved to 340 * css_set_rwsem, but, during migration, once tasks are moved to
315 * mg_tasks, it can be read safely while holding cgroup_mutex. 341 * mg_tasks, it can be read safely while holding cgroup_mutex.
316 */ 342 */
317 struct list_head tasks; 343 struct list_head tasks;
318 struct list_head mg_tasks; 344 struct list_head mg_tasks;
319 345
320 /* 346 /*
321 * List of cgrp_cset_links pointing at cgroups referenced from this 347 * List of cgrp_cset_links pointing at cgroups referenced from this
322 * css_set. Protected by css_set_lock. 348 * css_set. Protected by css_set_lock.
323 */ 349 */
324 struct list_head cgrp_links; 350 struct list_head cgrp_links;
325 351
326 /* the default cgroup associated with this css_set */ 352 /* the default cgroup associated with this css_set */
327 struct cgroup *dfl_cgrp; 353 struct cgroup *dfl_cgrp;
328 354
329 /* 355 /*
330 * Set of subsystem states, one for each subsystem. This array is 356 * Set of subsystem states, one for each subsystem. This array is
331 * immutable after creation apart from the init_css_set during 357 * immutable after creation apart from the init_css_set during
332 * subsystem registration (at boot time). 358 * subsystem registration (at boot time).
333 */ 359 */
334 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 360 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
335 361
336 /* 362 /*
337 * List of csets participating in the on-going migration either as 363 * List of csets participating in the on-going migration either as
338 * source or destination. Protected by cgroup_mutex. 364 * source or destination. Protected by cgroup_mutex.
339 */ 365 */
340 struct list_head mg_preload_node; 366 struct list_head mg_preload_node;
341 struct list_head mg_node; 367 struct list_head mg_node;
342 368
343 /* 369 /*
344 * If this cset is acting as the source of migration the following 370 * If this cset is acting as the source of migration the following
345 * two fields are set. mg_src_cgrp is the source cgroup of the 371 * two fields are set. mg_src_cgrp is the source cgroup of the
346 * on-going migration and mg_dst_cset is the destination cset the 372 * on-going migration and mg_dst_cset is the destination cset the
347 * target tasks on this cset should be migrated to. Protected by 373 * target tasks on this cset should be migrated to. Protected by
348 * cgroup_mutex. 374 * cgroup_mutex.
349 */ 375 */
350 struct cgroup *mg_src_cgrp; 376 struct cgroup *mg_src_cgrp;
351 struct css_set *mg_dst_cset; 377 struct css_set *mg_dst_cset;
352 378
353 /* 379 /*
354 * On the default hierarhcy, ->subsys[ssid] may point to a css 380 * On the default hierarhcy, ->subsys[ssid] may point to a css
355 * attached to an ancestor instead of the cgroup this css_set is 381 * attached to an ancestor instead of the cgroup this css_set is
356 * associated with. The following node is anchored at 382 * associated with. The following node is anchored at
357 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to 383 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
358 * iterate through all css's attached to a given cgroup. 384 * iterate through all css's attached to a given cgroup.
359 */ 385 */
360 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 386 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
361 387
362 /* For RCU-protected deletion */ 388 /* For RCU-protected deletion */
363 struct rcu_head rcu_head; 389 struct rcu_head rcu_head;
364 }; 390 };
365 391
366 /* 392 /*
367 * struct cftype: handler definitions for cgroup control files 393 * struct cftype: handler definitions for cgroup control files
368 * 394 *
369 * When reading/writing to a file: 395 * When reading/writing to a file:
370 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 396 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
371 * - the 'cftype' of the file is file->f_dentry->d_fsdata 397 * - the 'cftype' of the file is file->f_dentry->d_fsdata
372 */ 398 */
373 399
374 /* cftype->flags */ 400 /* cftype->flags */
375 enum { 401 enum {
376 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 402 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
377 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 403 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
378 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 404 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
379 405
380 /* internal flags, do not use outside cgroup core proper */ 406 /* internal flags, do not use outside cgroup core proper */
381 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ 407 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
382 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ 408 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
383 }; 409 };
384 410
385 #define MAX_CFTYPE_NAME 64 411 #define MAX_CFTYPE_NAME 64
386 412
387 struct cftype { 413 struct cftype {
388 /* 414 /*
389 * By convention, the name should begin with the name of the 415 * By convention, the name should begin with the name of the
390 * subsystem, followed by a period. Zero length string indicates 416 * subsystem, followed by a period. Zero length string indicates
391 * end of cftype array. 417 * end of cftype array.
392 */ 418 */
393 char name[MAX_CFTYPE_NAME]; 419 char name[MAX_CFTYPE_NAME];
394 int private; 420 int private;
395 /* 421 /*
396 * If not 0, file mode is set to this value, otherwise it will 422 * If not 0, file mode is set to this value, otherwise it will
397 * be figured out automatically 423 * be figured out automatically
398 */ 424 */
399 umode_t mode; 425 umode_t mode;
400 426
401 /* 427 /*
402 * The maximum length of string, excluding trailing nul, that can 428 * The maximum length of string, excluding trailing nul, that can
403 * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. 429 * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
404 */ 430 */
405 size_t max_write_len; 431 size_t max_write_len;
406 432
407 /* CFTYPE_* flags */ 433 /* CFTYPE_* flags */
408 unsigned int flags; 434 unsigned int flags;
409 435
410 /* 436 /*
411 * Fields used for internal bookkeeping. Initialized automatically 437 * Fields used for internal bookkeeping. Initialized automatically
412 * during registration. 438 * during registration.
413 */ 439 */
414 struct cgroup_subsys *ss; /* NULL for cgroup core files */ 440 struct cgroup_subsys *ss; /* NULL for cgroup core files */
415 struct list_head node; /* anchored at ss->cfts */ 441 struct list_head node; /* anchored at ss->cfts */
416 struct kernfs_ops *kf_ops; 442 struct kernfs_ops *kf_ops;
417 443
418 /* 444 /*
419 * read_u64() is a shortcut for the common case of returning a 445 * read_u64() is a shortcut for the common case of returning a
420 * single integer. Use it in place of read() 446 * single integer. Use it in place of read()
421 */ 447 */
422 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); 448 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
423 /* 449 /*
424 * read_s64() is a signed version of read_u64() 450 * read_s64() is a signed version of read_u64()
425 */ 451 */
426 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 452 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
427 453
428 /* generic seq_file read interface */ 454 /* generic seq_file read interface */
429 int (*seq_show)(struct seq_file *sf, void *v); 455 int (*seq_show)(struct seq_file *sf, void *v);
430 456
431 /* optional ops, implement all or none */ 457 /* optional ops, implement all or none */
432 void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 458 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
433 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 459 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
434 void (*seq_stop)(struct seq_file *sf, void *v); 460 void (*seq_stop)(struct seq_file *sf, void *v);
435 461
436 /* 462 /*
437 * write_u64() is a shortcut for the common case of accepting 463 * write_u64() is a shortcut for the common case of accepting
438 * a single integer (as parsed by simple_strtoull) from 464 * a single integer (as parsed by simple_strtoull) from
439 * userspace. Use in place of write(); return 0 or error. 465 * userspace. Use in place of write(); return 0 or error.
440 */ 466 */
441 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, 467 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
442 u64 val); 468 u64 val);
443 /* 469 /*
444 * write_s64() is a signed version of write_u64() 470 * write_s64() is a signed version of write_u64()
445 */ 471 */
446 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, 472 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
447 s64 val); 473 s64 val);
448 474
449 /* 475 /*
450 * write() is the generic write callback which maps directly to 476 * write() is the generic write callback which maps directly to
451 * kernfs write operation and overrides all other operations. 477 * kernfs write operation and overrides all other operations.
452 * Maximum write size is determined by ->max_write_len. Use 478 * Maximum write size is determined by ->max_write_len. Use
453 * of_css/cft() to access the associated css and cft. 479 * of_css/cft() to access the associated css and cft.
454 */ 480 */
455 ssize_t (*write)(struct kernfs_open_file *of, 481 ssize_t (*write)(struct kernfs_open_file *of,
456 char *buf, size_t nbytes, loff_t off); 482 char *buf, size_t nbytes, loff_t off);
457 483
458 #ifdef CONFIG_DEBUG_LOCK_ALLOC 484 #ifdef CONFIG_DEBUG_LOCK_ALLOC
459 struct lock_class_key lockdep_key; 485 struct lock_class_key lockdep_key;
460 #endif 486 #endif
461 }; 487 };
462 488
463 extern struct cgroup_root cgrp_dfl_root; 489 extern struct cgroup_root cgrp_dfl_root;
464 extern struct css_set init_css_set; 490 extern struct css_set init_css_set;
465 491
466 /** 492 /**
467 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy 493 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
468 * @cgrp: the cgroup of interest 494 * @cgrp: the cgroup of interest
469 * 495 *
470 * The default hierarchy is the v2 interface of cgroup and this function 496 * The default hierarchy is the v2 interface of cgroup and this function
471 * can be used to test whether a cgroup is on the default hierarchy for 497 * can be used to test whether a cgroup is on the default hierarchy for
472 * cases where a subsystem should behave differnetly depending on the 498 * cases where a subsystem should behave differnetly depending on the
473 * interface version. 499 * interface version.
474 * 500 *
475 * The set of behaviors which change on the default hierarchy are still 501 * The set of behaviors which change on the default hierarchy are still
476 * being determined and the mount option is prefixed with __DEVEL__. 502 * being determined and the mount option is prefixed with __DEVEL__.
477 * 503 *
478 * List of changed behaviors: 504 * List of changed behaviors:
479 * 505 *
480 * - Mount options "noprefix", "xattr", "clone_children", "release_agent" 506 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
481 * and "name" are disallowed. 507 * and "name" are disallowed.
482 * 508 *
483 * - When mounting an existing superblock, mount options should match. 509 * - When mounting an existing superblock, mount options should match.
484 * 510 *
485 * - Remount is disallowed. 511 * - Remount is disallowed.
486 * 512 *
487 * - rename(2) is disallowed. 513 * - rename(2) is disallowed.
488 * 514 *
489 * - "tasks" is removed. Everything should be at process granularity. Use 515 * - "tasks" is removed. Everything should be at process granularity. Use
490 * "cgroup.procs" instead. 516 * "cgroup.procs" instead.
491 * 517 *
492 * - "cgroup.procs" is not sorted. pids will be unique unless they got 518 * - "cgroup.procs" is not sorted. pids will be unique unless they got
493 * recycled inbetween reads. 519 * recycled inbetween reads.
494 * 520 *
495 * - "release_agent" and "notify_on_release" are removed. Replacement 521 * - "release_agent" and "notify_on_release" are removed. Replacement
496 * notification mechanism will be implemented. 522 * notification mechanism will be implemented.
497 * 523 *
498 * - "cgroup.clone_children" is removed. 524 * - "cgroup.clone_children" is removed.
499 * 525 *
500 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup 526 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
501 * and its descendants contain no task; otherwise, 1. The file also 527 * and its descendants contain no task; otherwise, 1. The file also
502 * generates kernfs notification which can be monitored through poll and 528 * generates kernfs notification which can be monitored through poll and
503 * [di]notify when the value of the file changes. 529 * [di]notify when the value of the file changes.
504 * 530 *
505 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and 531 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
506 * take masks of ancestors with non-empty cpus/mems, instead of being 532 * take masks of ancestors with non-empty cpus/mems, instead of being
507 * moved to an ancestor. 533 * moved to an ancestor.
508 * 534 *
509 * - cpuset: a task can be moved into an empty cpuset, and again it takes 535 * - cpuset: a task can be moved into an empty cpuset, and again it takes
510 * masks of ancestors. 536 * masks of ancestors.
511 * 537 *
512 * - memcg: use_hierarchy is on by default and the cgroup file for the flag 538 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
513 * is not created. 539 * is not created.
514 * 540 *
515 * - blkcg: blk-throttle becomes properly hierarchical. 541 * - blkcg: blk-throttle becomes properly hierarchical.
516 * 542 *
517 * - debug: disallowed on the default hierarchy. 543 * - debug: disallowed on the default hierarchy.
518 */ 544 */
519 static inline bool cgroup_on_dfl(const struct cgroup *cgrp) 545 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
520 { 546 {
521 return cgrp->root == &cgrp_dfl_root; 547 return cgrp->root == &cgrp_dfl_root;
522 } 548 }
523 549
524 /* no synchronization, the result can only be used as a hint */ 550 /* no synchronization, the result can only be used as a hint */
525 static inline bool cgroup_has_tasks(struct cgroup *cgrp) 551 static inline bool cgroup_has_tasks(struct cgroup *cgrp)
526 { 552 {
527 return !list_empty(&cgrp->cset_links); 553 return !list_empty(&cgrp->cset_links);
528 } 554 }
529 555
530 /* returns ino associated with a cgroup */ 556 /* returns ino associated with a cgroup */
531 static inline ino_t cgroup_ino(struct cgroup *cgrp) 557 static inline ino_t cgroup_ino(struct cgroup *cgrp)
532 { 558 {
533 return cgrp->kn->ino; 559 return cgrp->kn->ino;
534 } 560 }
535 561
536 /* cft/css accessors for cftype->write() operation */ 562 /* cft/css accessors for cftype->write() operation */
537 static inline struct cftype *of_cft(struct kernfs_open_file *of) 563 static inline struct cftype *of_cft(struct kernfs_open_file *of)
538 { 564 {
539 return of->kn->priv; 565 return of->kn->priv;
540 } 566 }
541 567
542 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); 568 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
543 569
544 /* cft/css accessors for cftype->seq_*() operations */ 570 /* cft/css accessors for cftype->seq_*() operations */
545 static inline struct cftype *seq_cft(struct seq_file *seq) 571 static inline struct cftype *seq_cft(struct seq_file *seq)
546 { 572 {
547 return of_cft(seq->private); 573 return of_cft(seq->private);
548 } 574 }
549 575
550 static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) 576 static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
551 { 577 {
552 return of_css(seq->private); 578 return of_css(seq->private);
553 } 579 }
554 580
555 /* 581 /*
556 * Name / path handling functions. All are thin wrappers around the kernfs 582 * Name / path handling functions. All are thin wrappers around the kernfs
557 * counterparts and can be called under any context. 583 * counterparts and can be called under any context.
558 */ 584 */
559 585
560 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) 586 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
561 { 587 {
562 return kernfs_name(cgrp->kn, buf, buflen); 588 return kernfs_name(cgrp->kn, buf, buflen);
563 } 589 }
564 590
565 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, 591 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
566 size_t buflen) 592 size_t buflen)
567 { 593 {
568 return kernfs_path(cgrp->kn, buf, buflen); 594 return kernfs_path(cgrp->kn, buf, buflen);
569 } 595 }
570 596
571 static inline void pr_cont_cgroup_name(struct cgroup *cgrp) 597 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
572 { 598 {
573 pr_cont_kernfs_name(cgrp->kn); 599 pr_cont_kernfs_name(cgrp->kn);
574 } 600 }
575 601
576 static inline void pr_cont_cgroup_path(struct cgroup *cgrp) 602 static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
577 { 603 {
578 pr_cont_kernfs_path(cgrp->kn); 604 pr_cont_kernfs_path(cgrp->kn);
579 } 605 }
580 606
581 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 607 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
582 608
583 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 609 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
584 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 610 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
585 int cgroup_rm_cftypes(struct cftype *cfts); 611 int cgroup_rm_cftypes(struct cftype *cfts);
586 612
587 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 613 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
588 614
589 /* 615 /*
590 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 616 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
591 * methods. 617 * methods.
592 */ 618 */
593 struct cgroup_taskset; 619 struct cgroup_taskset;
594 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 620 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
595 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 621 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
596 622
597 /** 623 /**
598 * cgroup_taskset_for_each - iterate cgroup_taskset 624 * cgroup_taskset_for_each - iterate cgroup_taskset
599 * @task: the loop cursor 625 * @task: the loop cursor
600 * @tset: taskset to iterate 626 * @tset: taskset to iterate
601 */ 627 */
602 #define cgroup_taskset_for_each(task, tset) \ 628 #define cgroup_taskset_for_each(task, tset) \
603 for ((task) = cgroup_taskset_first((tset)); (task); \ 629 for ((task) = cgroup_taskset_first((tset)); (task); \
604 (task) = cgroup_taskset_next((tset))) 630 (task) = cgroup_taskset_next((tset)))
605 631
606 /* 632 /*
607 * Control Group subsystem type. 633 * Control Group subsystem type.
608 * See Documentation/cgroups/cgroups.txt for details 634 * See Documentation/cgroups/cgroups.txt for details
609 */ 635 */
610 636
611 struct cgroup_subsys { 637 struct cgroup_subsys {
612 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); 638 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
613 int (*css_online)(struct cgroup_subsys_state *css); 639 int (*css_online)(struct cgroup_subsys_state *css);
614 void (*css_offline)(struct cgroup_subsys_state *css); 640 void (*css_offline)(struct cgroup_subsys_state *css);
615 void (*css_free)(struct cgroup_subsys_state *css); 641 void (*css_free)(struct cgroup_subsys_state *css);
616 void (*css_reset)(struct cgroup_subsys_state *css); 642 void (*css_reset)(struct cgroup_subsys_state *css);
617 643
618 int (*can_attach)(struct cgroup_subsys_state *css, 644 int (*can_attach)(struct cgroup_subsys_state *css,
619 struct cgroup_taskset *tset); 645 struct cgroup_taskset *tset);
620 void (*cancel_attach)(struct cgroup_subsys_state *css, 646 void (*cancel_attach)(struct cgroup_subsys_state *css,
621 struct cgroup_taskset *tset); 647 struct cgroup_taskset *tset);
622 void (*attach)(struct cgroup_subsys_state *css, 648 void (*attach)(struct cgroup_subsys_state *css,
623 struct cgroup_taskset *tset); 649 struct cgroup_taskset *tset);
624 void (*fork)(struct task_struct *task); 650 void (*fork)(struct task_struct *task);
625 void (*exit)(struct cgroup_subsys_state *css, 651 void (*exit)(struct cgroup_subsys_state *css,
626 struct cgroup_subsys_state *old_css, 652 struct cgroup_subsys_state *old_css,
627 struct task_struct *task); 653 struct task_struct *task);
628 void (*bind)(struct cgroup_subsys_state *root_css); 654 void (*bind)(struct cgroup_subsys_state *root_css);
629 655
630 int disabled; 656 int disabled;
631 int early_init; 657 int early_init;
632 658
633 /* 659 /*
634 * If %false, this subsystem is properly hierarchical - 660 * If %false, this subsystem is properly hierarchical -
635 * configuration, resource accounting and restriction on a parent 661 * configuration, resource accounting and restriction on a parent
636 * cgroup cover those of its children. If %true, hierarchy support 662 * cgroup cover those of its children. If %true, hierarchy support
637 * is broken in some ways - some subsystems ignore hierarchy 663 * is broken in some ways - some subsystems ignore hierarchy
638 * completely while others are only implemented half-way. 664 * completely while others are only implemented half-way.
639 * 665 *
640 * It's now disallowed to create nested cgroups if the subsystem is 666 * It's now disallowed to create nested cgroups if the subsystem is
641 * broken and cgroup core will emit a warning message on such 667 * broken and cgroup core will emit a warning message on such
642 * cases. Eventually, all subsystems will be made properly 668 * cases. Eventually, all subsystems will be made properly
643 * hierarchical and this will go away. 669 * hierarchical and this will go away.
644 */ 670 */
645 bool broken_hierarchy; 671 bool broken_hierarchy;
646 bool warned_broken_hierarchy; 672 bool warned_broken_hierarchy;
647 673
648 /* the following two fields are initialized automtically during boot */ 674 /* the following two fields are initialized automtically during boot */
649 int id; 675 int id;
650 #define MAX_CGROUP_TYPE_NAMELEN 32 676 #define MAX_CGROUP_TYPE_NAMELEN 32
651 const char *name; 677 const char *name;
652 678
653 /* link to parent, protected by cgroup_lock() */ 679 /* link to parent, protected by cgroup_lock() */
654 struct cgroup_root *root; 680 struct cgroup_root *root;
655 681
656 /* idr for css->id */ 682 /* idr for css->id */
657 struct idr css_idr; 683 struct idr css_idr;
658 684
659 /* 685 /*
660 * List of cftypes. Each entry is the first entry of an array 686 * List of cftypes. Each entry is the first entry of an array
661 * terminated by zero length name. 687 * terminated by zero length name.
662 */ 688 */
663 struct list_head cfts; 689 struct list_head cfts;
664 690
665 /* 691 /*
666 * Base cftypes which are automatically registered. The two can 692 * Base cftypes which are automatically registered. The two can
667 * point to the same array. 693 * point to the same array.
668 */ 694 */
669 struct cftype *dfl_cftypes; /* for the default hierarchy */ 695 struct cftype *dfl_cftypes; /* for the default hierarchy */
670 struct cftype *legacy_cftypes; /* for the legacy hierarchies */ 696 struct cftype *legacy_cftypes; /* for the legacy hierarchies */
671 697
672 /* 698 /*
673 * A subsystem may depend on other subsystems. When such subsystem 699 * A subsystem may depend on other subsystems. When such subsystem
674 * is enabled on a cgroup, the depended-upon subsystems are enabled 700 * is enabled on a cgroup, the depended-upon subsystems are enabled
675 * together if available. Subsystems enabled due to dependency are 701 * together if available. Subsystems enabled due to dependency are
676 * not visible to userland until explicitly enabled. The following 702 * not visible to userland until explicitly enabled. The following
677 * specifies the mask of subsystems that this one depends on. 703 * specifies the mask of subsystems that this one depends on.
678 */ 704 */
679 unsigned int depends_on; 705 unsigned int depends_on;
680 }; 706 };
681 707
682 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; 708 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
683 #include <linux/cgroup_subsys.h> 709 #include <linux/cgroup_subsys.h>
684 #undef SUBSYS 710 #undef SUBSYS
685 711
686 /** 712 /**
687 * task_css_set_check - obtain a task's css_set with extra access conditions 713 * task_css_set_check - obtain a task's css_set with extra access conditions
688 * @task: the task to obtain css_set for 714 * @task: the task to obtain css_set for
689 * @__c: extra condition expression to be passed to rcu_dereference_check() 715 * @__c: extra condition expression to be passed to rcu_dereference_check()
690 * 716 *
691 * A task's css_set is RCU protected, initialized and exited while holding 717 * A task's css_set is RCU protected, initialized and exited while holding
692 * task_lock(), and can only be modified while holding both cgroup_mutex 718 * task_lock(), and can only be modified while holding both cgroup_mutex
693 * and task_lock() while the task is alive. This macro verifies that the 719 * and task_lock() while the task is alive. This macro verifies that the
694 * caller is inside proper critical section and returns @task's css_set. 720 * caller is inside proper critical section and returns @task's css_set.
695 * 721 *
696 * The caller can also specify additional allowed conditions via @__c, such 722 * The caller can also specify additional allowed conditions via @__c, such
697 * as locks used during the cgroup_subsys::attach() methods. 723 * as locks used during the cgroup_subsys::attach() methods.
698 */ 724 */
699 #ifdef CONFIG_PROVE_RCU 725 #ifdef CONFIG_PROVE_RCU
700 extern struct mutex cgroup_mutex; 726 extern struct mutex cgroup_mutex;
701 extern struct rw_semaphore css_set_rwsem; 727 extern struct rw_semaphore css_set_rwsem;
702 #define task_css_set_check(task, __c) \ 728 #define task_css_set_check(task, __c) \
703 rcu_dereference_check((task)->cgroups, \ 729 rcu_dereference_check((task)->cgroups, \
704 lockdep_is_held(&cgroup_mutex) || \ 730 lockdep_is_held(&cgroup_mutex) || \
705 lockdep_is_held(&css_set_rwsem) || \ 731 lockdep_is_held(&css_set_rwsem) || \
706 ((task)->flags & PF_EXITING) || (__c)) 732 ((task)->flags & PF_EXITING) || (__c))
707 #else 733 #else
708 #define task_css_set_check(task, __c) \ 734 #define task_css_set_check(task, __c) \
709 rcu_dereference((task)->cgroups) 735 rcu_dereference((task)->cgroups)
710 #endif 736 #endif
711 737
712 /** 738 /**
713 * task_css_check - obtain css for (task, subsys) w/ extra access conds 739 * task_css_check - obtain css for (task, subsys) w/ extra access conds
714 * @task: the target task 740 * @task: the target task
715 * @subsys_id: the target subsystem ID 741 * @subsys_id: the target subsystem ID
716 * @__c: extra condition expression to be passed to rcu_dereference_check() 742 * @__c: extra condition expression to be passed to rcu_dereference_check()
717 * 743 *
718 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The 744 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
719 * synchronization rules are the same as task_css_set_check(). 745 * synchronization rules are the same as task_css_set_check().
720 */ 746 */
721 #define task_css_check(task, subsys_id, __c) \ 747 #define task_css_check(task, subsys_id, __c) \
722 task_css_set_check((task), (__c))->subsys[(subsys_id)] 748 task_css_set_check((task), (__c))->subsys[(subsys_id)]
723 749
724 /** 750 /**
725 * task_css_set - obtain a task's css_set 751 * task_css_set - obtain a task's css_set
726 * @task: the task to obtain css_set for 752 * @task: the task to obtain css_set for
727 * 753 *
728 * See task_css_set_check(). 754 * See task_css_set_check().
729 */ 755 */
730 static inline struct css_set *task_css_set(struct task_struct *task) 756 static inline struct css_set *task_css_set(struct task_struct *task)
731 { 757 {
732 return task_css_set_check(task, false); 758 return task_css_set_check(task, false);
733 } 759 }
734 760
735 /** 761 /**
736 * task_css - obtain css for (task, subsys) 762 * task_css - obtain css for (task, subsys)
737 * @task: the target task 763 * @task: the target task
738 * @subsys_id: the target subsystem ID 764 * @subsys_id: the target subsystem ID
739 * 765 *
740 * See task_css_check(). 766 * See task_css_check().
741 */ 767 */
742 static inline struct cgroup_subsys_state *task_css(struct task_struct *task, 768 static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
743 int subsys_id) 769 int subsys_id)
744 { 770 {
745 return task_css_check(task, subsys_id, false); 771 return task_css_check(task, subsys_id, false);
746 } 772 }
747 773
748 /** 774 /**
749 * task_css_is_root - test whether a task belongs to the root css 775 * task_css_is_root - test whether a task belongs to the root css
750 * @task: the target task 776 * @task: the target task
751 * @subsys_id: the target subsystem ID 777 * @subsys_id: the target subsystem ID
752 * 778 *
753 * Test whether @task belongs to the root css on the specified subsystem. 779 * Test whether @task belongs to the root css on the specified subsystem.
754 * May be invoked in any context. 780 * May be invoked in any context.
755 */ 781 */
756 static inline bool task_css_is_root(struct task_struct *task, int subsys_id) 782 static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
757 { 783 {
758 return task_css_check(task, subsys_id, true) == 784 return task_css_check(task, subsys_id, true) ==
759 init_css_set.subsys[subsys_id]; 785 init_css_set.subsys[subsys_id];
760 } 786 }
761 787
762 static inline struct cgroup *task_cgroup(struct task_struct *task, 788 static inline struct cgroup *task_cgroup(struct task_struct *task,
763 int subsys_id) 789 int subsys_id)
764 { 790 {
765 return task_css(task, subsys_id)->cgroup; 791 return task_css(task, subsys_id)->cgroup;
766 } 792 }
767 793
768 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, 794 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
769 struct cgroup_subsys_state *parent); 795 struct cgroup_subsys_state *parent);
770 796
771 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 797 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
772 798
773 /** 799 /**
774 * css_for_each_child - iterate through children of a css 800 * css_for_each_child - iterate through children of a css
775 * @pos: the css * to use as the loop cursor 801 * @pos: the css * to use as the loop cursor
776 * @parent: css whose children to walk 802 * @parent: css whose children to walk
777 * 803 *
778 * Walk @parent's children. Must be called under rcu_read_lock(). 804 * Walk @parent's children. Must be called under rcu_read_lock().
779 * 805 *
780 * If a subsystem synchronizes ->css_online() and the start of iteration, a 806 * If a subsystem synchronizes ->css_online() and the start of iteration, a
781 * css which finished ->css_online() is guaranteed to be visible in the 807 * css which finished ->css_online() is guaranteed to be visible in the
782 * future iterations and will stay visible until the last reference is put. 808 * future iterations and will stay visible until the last reference is put.
783 * A css which hasn't finished ->css_online() or already finished 809 * A css which hasn't finished ->css_online() or already finished
784 * ->css_offline() may show up during traversal. It's each subsystem's 810 * ->css_offline() may show up during traversal. It's each subsystem's
785 * responsibility to synchronize against on/offlining. 811 * responsibility to synchronize against on/offlining.
786 * 812 *
787 * It is allowed to temporarily drop RCU read lock during iteration. The 813 * It is allowed to temporarily drop RCU read lock during iteration. The
788 * caller is responsible for ensuring that @pos remains accessible until 814 * caller is responsible for ensuring that @pos remains accessible until
789 * the start of the next iteration by, for example, bumping the css refcnt. 815 * the start of the next iteration by, for example, bumping the css refcnt.
790 */ 816 */
791 #define css_for_each_child(pos, parent) \ 817 #define css_for_each_child(pos, parent) \
792 for ((pos) = css_next_child(NULL, (parent)); (pos); \ 818 for ((pos) = css_next_child(NULL, (parent)); (pos); \
793 (pos) = css_next_child((pos), (parent))) 819 (pos) = css_next_child((pos), (parent)))
794 820
795 struct cgroup_subsys_state * 821 struct cgroup_subsys_state *
796 css_next_descendant_pre(struct cgroup_subsys_state *pos, 822 css_next_descendant_pre(struct cgroup_subsys_state *pos,
797 struct cgroup_subsys_state *css); 823 struct cgroup_subsys_state *css);
798 824
799 struct cgroup_subsys_state * 825 struct cgroup_subsys_state *
800 css_rightmost_descendant(struct cgroup_subsys_state *pos); 826 css_rightmost_descendant(struct cgroup_subsys_state *pos);
801 827
802 /** 828 /**
803 * css_for_each_descendant_pre - pre-order walk of a css's descendants 829 * css_for_each_descendant_pre - pre-order walk of a css's descendants
804 * @pos: the css * to use as the loop cursor 830 * @pos: the css * to use as the loop cursor
805 * @root: css whose descendants to walk 831 * @root: css whose descendants to walk
806 * 832 *
807 * Walk @root's descendants. @root is included in the iteration and the 833 * Walk @root's descendants. @root is included in the iteration and the
808 * first node to be visited. Must be called under rcu_read_lock(). 834 * first node to be visited. Must be called under rcu_read_lock().
809 * 835 *
810 * If a subsystem synchronizes ->css_online() and the start of iteration, a 836 * If a subsystem synchronizes ->css_online() and the start of iteration, a
811 * css which finished ->css_online() is guaranteed to be visible in the 837 * css which finished ->css_online() is guaranteed to be visible in the
812 * future iterations and will stay visible until the last reference is put. 838 * future iterations and will stay visible until the last reference is put.
813 * A css which hasn't finished ->css_online() or already finished 839 * A css which hasn't finished ->css_online() or already finished
814 * ->css_offline() may show up during traversal. It's each subsystem's 840 * ->css_offline() may show up during traversal. It's each subsystem's
815 * responsibility to synchronize against on/offlining. 841 * responsibility to synchronize against on/offlining.
816 * 842 *
817 * For example, the following guarantees that a descendant can't escape 843 * For example, the following guarantees that a descendant can't escape
818 * state updates of its ancestors. 844 * state updates of its ancestors.
819 * 845 *
820 * my_online(@css) 846 * my_online(@css)
821 * { 847 * {
822 * Lock @css's parent and @css; 848 * Lock @css's parent and @css;
823 * Inherit state from the parent; 849 * Inherit state from the parent;
824 * Unlock both. 850 * Unlock both.
825 * } 851 * }
826 * 852 *
827 * my_update_state(@css) 853 * my_update_state(@css)
828 * { 854 * {
829 * css_for_each_descendant_pre(@pos, @css) { 855 * css_for_each_descendant_pre(@pos, @css) {
830 * Lock @pos; 856 * Lock @pos;
831 * if (@pos == @css) 857 * if (@pos == @css)
832 * Update @css's state; 858 * Update @css's state;
833 * else 859 * else
834 * Verify @pos is alive and inherit state from its parent; 860 * Verify @pos is alive and inherit state from its parent;
835 * Unlock @pos; 861 * Unlock @pos;
836 * } 862 * }
837 * } 863 * }
838 * 864 *
839 * As long as the inheriting step, including checking the parent state, is 865 * As long as the inheriting step, including checking the parent state, is
840 * enclosed inside @pos locking, double-locking the parent isn't necessary 866 * enclosed inside @pos locking, double-locking the parent isn't necessary
841 * while inheriting. The state update to the parent is guaranteed to be 867 * while inheriting. The state update to the parent is guaranteed to be
842 * visible by walking order and, as long as inheriting operations to the 868 * visible by walking order and, as long as inheriting operations to the
843 * same @pos are atomic to each other, multiple updates racing each other 869 * same @pos are atomic to each other, multiple updates racing each other
844 * still result in the correct state. It's guaranteed that at least one 870 * still result in the correct state. It's guaranteed that at least one
845 * inheritance happens for any css after the latest update to its parent. 871 * inheritance happens for any css after the latest update to its parent.
846 * 872 *
847 * If checking parent's state requires locking the parent, each inheriting 873 * If checking parent's state requires locking the parent, each inheriting
848 * iteration should lock and unlock both @pos->parent and @pos. 874 * iteration should lock and unlock both @pos->parent and @pos.
849 * 875 *
850 * Alternatively, a subsystem may choose to use a single global lock to 876 * Alternatively, a subsystem may choose to use a single global lock to
851 * synchronize ->css_online() and ->css_offline() against tree-walking 877 * synchronize ->css_online() and ->css_offline() against tree-walking
852 * operations. 878 * operations.
853 * 879 *
854 * It is allowed to temporarily drop RCU read lock during iteration. The 880 * It is allowed to temporarily drop RCU read lock during iteration. The
855 * caller is responsible for ensuring that @pos remains accessible until 881 * caller is responsible for ensuring that @pos remains accessible until
856 * the start of the next iteration by, for example, bumping the css refcnt. 882 * the start of the next iteration by, for example, bumping the css refcnt.
857 */ 883 */
858 #define css_for_each_descendant_pre(pos, css) \ 884 #define css_for_each_descendant_pre(pos, css) \
859 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ 885 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
860 (pos) = css_next_descendant_pre((pos), (css))) 886 (pos) = css_next_descendant_pre((pos), (css)))
861 887
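Editor's note: a concrete form of the inheritance pattern described in the comment above, as a hedged illustration only. struct my_css_state, my_lock, my_state() and the value field are hypothetical and not part of the cgroup API; only css_for_each_descendant_pre() and css->parent come from the header itself.

#include <linux/cgroup.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>

/* Hypothetical subsystem-private state wrapped around a css. */
struct my_css_state {
        struct cgroup_subsys_state css;
        bool online;    /* set in ->css_online(), cleared in ->css_offline() */
        int value;      /* state inherited from the parent */
};

static DEFINE_SPINLOCK(my_lock);        /* single lock shared by all csses */

static inline struct my_css_state *my_state(struct cgroup_subsys_state *css)
{
        return container_of(css, struct my_css_state, css);
}

/* Push @value down the subtree; pre-order guarantees a parent is updated
 * before any of its descendants are visited. */
static void my_update_state(struct cgroup_subsys_state *root, int value)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                spin_lock(&my_lock);
                if (pos == root)
                        my_state(pos)->value = value;
                else if (my_state(pos)->online)
                        my_state(pos)->value = my_state(pos->parent)->value;
                spin_unlock(&my_lock);
        }
        rcu_read_unlock();
}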
862 struct cgroup_subsys_state * 888 struct cgroup_subsys_state *
863 css_next_descendant_post(struct cgroup_subsys_state *pos, 889 css_next_descendant_post(struct cgroup_subsys_state *pos,
864 struct cgroup_subsys_state *css); 890 struct cgroup_subsys_state *css);
865 891
866 /** 892 /**
867 * css_for_each_descendant_post - post-order walk of a css's descendants 893 * css_for_each_descendant_post - post-order walk of a css's descendants
868 * @pos: the css * to use as the loop cursor 894 * @pos: the css * to use as the loop cursor
869 * @css: css whose descendants to walk 895 * @css: css whose descendants to walk
870 * 896 *
871 * Similar to css_for_each_descendant_pre() but performs post-order 897 * Similar to css_for_each_descendant_pre() but performs post-order
872 * traversal instead. @css is included in the iteration and the last 898 * traversal instead. @css is included in the iteration and the last
873 * node to be visited. 899 * node to be visited.
874 * 900 *
875 * If a subsystem synchronizes ->css_online() and the start of iteration, a 901 * If a subsystem synchronizes ->css_online() and the start of iteration, a
876 * css which finished ->css_online() is guaranteed to be visible in the 902 * css which finished ->css_online() is guaranteed to be visible in the
877 * future iterations and will stay visible until the last reference is put. 903 * future iterations and will stay visible until the last reference is put.
878 * A css which hasn't finished ->css_online() or already finished 904 * A css which hasn't finished ->css_online() or already finished
879 * ->css_offline() may show up during traversal. It's each subsystem's 905 * ->css_offline() may show up during traversal. It's each subsystem's
880 * responsibility to synchronize against on/offlining. 906 * responsibility to synchronize against on/offlining.
881 * 907 *
882 * Note that the walk visibility guarantee example described in pre-order 908 * Note that the walk visibility guarantee example described in pre-order
883 * walk doesn't apply the same to post-order walks. 909 * walk doesn't apply the same to post-order walks.
884 */ 910 */
885 #define css_for_each_descendant_post(pos, css) \ 911 #define css_for_each_descendant_post(pos, css) \
886 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 912 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
887 (pos) = css_next_descendant_post((pos), (css))) 913 (pos) = css_next_descendant_post((pos), (css)))
888 914
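Editor's note: for contrast, a post-order walk visits every child before its parent, which suits bottom-up work such as folding per-css counters toward the root of the walked subtree. A hedged sketch, reusing the hypothetical my_state()/my_lock helpers from the pre-order example and assuming my_css_state also carries an unsigned long usage counter.

/* Post-order visits children before their parent, so each parent sees
 * fully drained children by the time it is reached. */
static void my_drain_subtree(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root) {
                if (pos == root)
                        continue;       /* keep the walk's root as the sink */
                spin_lock(&my_lock);
                my_state(pos->parent)->usage += my_state(pos)->usage;
                my_state(pos)->usage = 0;
                spin_unlock(&my_lock);
        }
        rcu_read_unlock();
}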
889 bool css_has_online_children(struct cgroup_subsys_state *css); 915 bool css_has_online_children(struct cgroup_subsys_state *css);
890 916
891 /* A css_task_iter should be treated as an opaque object */ 917 /* A css_task_iter should be treated as an opaque object */
892 struct css_task_iter { 918 struct css_task_iter {
893 struct cgroup_subsys *ss; 919 struct cgroup_subsys *ss;
894 920
895 struct list_head *cset_pos; 921 struct list_head *cset_pos;
896 struct list_head *cset_head; 922 struct list_head *cset_head;
897 923
898 struct list_head *task_pos; 924 struct list_head *task_pos;
899 struct list_head *tasks_head; 925 struct list_head *tasks_head;
900 struct list_head *mg_tasks_head; 926 struct list_head *mg_tasks_head;
901 }; 927 };
902 928
903 void css_task_iter_start(struct cgroup_subsys_state *css, 929 void css_task_iter_start(struct cgroup_subsys_state *css,
904 struct css_task_iter *it); 930 struct css_task_iter *it);
905 struct task_struct *css_task_iter_next(struct css_task_iter *it); 931 struct task_struct *css_task_iter_next(struct css_task_iter *it);
906 void css_task_iter_end(struct css_task_iter *it); 932 void css_task_iter_end(struct css_task_iter *it);
907 933
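Editor's note: the iterator above is driven with a simple start / next-until-NULL / end pattern. A minimal sketch; the pr_info() body is a stand-in for real per-task work.

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

/* Visit every task attached to @css. css_task_iter hides the underlying
 * css_set bookkeeping; the caller only loops until the iterator returns NULL. */
static void my_walk_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                pr_info("pid %d belongs to this css\n", task_pid_nr(task));
        css_task_iter_end(&it);
}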
908 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 934 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
909 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 935 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
910 936
911 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, 937 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
912 struct cgroup_subsys *ss); 938 struct cgroup_subsys *ss);
913 939
914 #else /* !CONFIG_CGROUPS */ 940 #else /* !CONFIG_CGROUPS */
915 941
916 static inline int cgroup_init_early(void) { return 0; } 942 static inline int cgroup_init_early(void) { return 0; }
917 static inline int cgroup_init(void) { return 0; } 943 static inline int cgroup_init(void) { return 0; }
918 static inline void cgroup_fork(struct task_struct *p) {} 944 static inline void cgroup_fork(struct task_struct *p) {}
919 static inline void cgroup_post_fork(struct task_struct *p) {} 945 static inline void cgroup_post_fork(struct task_struct *p) {}
920 static inline void cgroup_exit(struct task_struct *p) {} 946 static inline void cgroup_exit(struct task_struct *p) {}
921 947
922 static inline int cgroupstats_build(struct cgroupstats *stats, 948 static inline int cgroupstats_build(struct cgroupstats *stats,
923 struct dentry *dentry) 949 struct dentry *dentry)
924 { 950 {
925 return -EINVAL; 951 return -EINVAL;
926 } 952 }
927 953
928 /* No cgroups - nothing to do */ 954 /* No cgroups - nothing to do */
929 static inline int cgroup_attach_task_all(struct task_struct *from, 955 static inline int cgroup_attach_task_all(struct task_struct *from,
930 struct task_struct *t) 956 struct task_struct *t)
931 { 957 {
932 return 0; 958 return 0;
933 } 959 }
934 960
935 #endif /* !CONFIG_CGROUPS */ 961 #endif /* !CONFIG_CGROUPS */
936 962
937 #endif /* _LINUX_CGROUP_H */ 963 #endif /* _LINUX_CGROUP_H */
938 964
include/linux/percpu-refcount.h
1 /* 1 /*
2 * Percpu refcounts: 2 * Percpu refcounts:
3 * (C) 2012 Google, Inc. 3 * (C) 2012 Google, Inc.
4 * Author: Kent Overstreet <koverstreet@google.com> 4 * Author: Kent Overstreet <koverstreet@google.com>
5 * 5 *
6 * This implements a refcount with similar semantics to atomic_t - atomic_inc(), 6 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
7 * atomic_dec_and_test() - but percpu. 7 * atomic_dec_and_test() - but percpu.
8 * 8 *
9 * There's one important difference between percpu refs and normal atomic_t 9 * There's one important difference between percpu refs and normal atomic_t
10 * refcounts; you have to keep track of your initial refcount, and then when you 10 * refcounts; you have to keep track of your initial refcount, and then when you
11 * start shutting down you call percpu_ref_kill() _before_ dropping the initial 11 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
12 * refcount. 12 * refcount.
13 * 13 *
14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less 14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
15 * than an atomic_t - this is because of the way shutdown works, see 15 * than an atomic_t - this is because of the way shutdown works, see
16 * percpu_ref_kill()/PERCPU_COUNT_BIAS. 16 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
17 * 17 *
18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the 18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() 19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
20 * puts the ref back in single atomic_t mode, collecting the per cpu refs and 20 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
21 * issuing the appropriate barriers, and then marks the ref as shutting down so 21 * issuing the appropriate barriers, and then marks the ref as shutting down so
22 * that percpu_ref_put() will check for the ref hitting 0. After it returns, 22 * that percpu_ref_put() will check for the ref hitting 0. After it returns,
23 * it's safe to drop the initial ref. 23 * it's safe to drop the initial ref.
24 * 24 *
25 * USAGE: 25 * USAGE:
26 * 26 *
27 * See fs/aio.c for some example usage; it's used there for struct kioctx, which 27 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
28 * is created when userspace calls io_setup(), and destroyed when userspace 28 * is created when userspace calls io_setup(), and destroyed when userspace
29 * calls io_destroy() or the process exits. 29 * calls io_destroy() or the process exits.
30 * 30 *
31 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it 31 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
32 * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove 32 * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
33 * the kioctx from the process's list of kioctxs - after that, there can't be 33 * the kioctx from the process's list of kioctxs - after that, there can't be
34 * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop 34 * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
35 * the initial ref with percpu_ref_put(). 35 * the initial ref with percpu_ref_put().
36 * 36 *
37 * Code that does a two stage shutdown like this often needs some kind of 37 * Code that does a two stage shutdown like this often needs some kind of
38 * explicit synchronization to ensure the initial refcount can only be dropped 38 * explicit synchronization to ensure the initial refcount can only be dropped
39 * once - percpu_ref_kill() does this for you, it returns true once and false if 39 * once - percpu_ref_kill() does this for you, it returns true once and false if
40 * someone else already called it. The aio code uses it this way, but it's not 40 * someone else already called it. The aio code uses it this way, but it's not
41 * necessary if the code has some other mechanism to synchronize teardown. 41 * necessary if the code has some other mechanism to synchronize teardown.
42 * 42 *
43 */ 43 */
44 44
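Editor's note: a condensed illustration of the init/get/put/kill lifecycle described in the comment above. struct my_obj, its release callback and the helpers are hypothetical, and the synchronization that stops new lookups is elided; only the percpu_ref_* calls come from this header.

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_obj {
        struct percpu_ref ref;
        /* ... payload ... */
};

/* Called once the last reference is dropped. */
static void my_obj_release(struct percpu_ref *ref)
{
        kfree(container_of(ref, struct my_obj, ref));
}

static struct my_obj *my_obj_create(void)
{
        struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return NULL;
        if (percpu_ref_init(&obj->ref, my_obj_release, 0, GFP_KERNEL)) {
                kfree(obj);
                return NULL;
        }
        return obj;             /* creation holds the initial reference */
}

/* Short-term users bracket their access with get/put. */
static void my_obj_use(struct my_obj *obj)
{
        percpu_ref_get(&obj->ref);
        /* ... operate on obj ... */
        percpu_ref_put(&obj->ref);
}

static void my_obj_destroy(struct my_obj *obj)
{
        /*
         * After unpublishing the object so no new lookups can find it,
         * drop the initial reference exactly once; my_obj_release() runs
         * when all outstanding references are gone.
         */
        percpu_ref_kill(&obj->ref);
}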
45 #ifndef _LINUX_PERCPU_REFCOUNT_H 45 #ifndef _LINUX_PERCPU_REFCOUNT_H
46 #define _LINUX_PERCPU_REFCOUNT_H 46 #define _LINUX_PERCPU_REFCOUNT_H
47 47
48 #include <linux/atomic.h> 48 #include <linux/atomic.h>
49 #include <linux/kernel.h> 49 #include <linux/kernel.h>
50 #include <linux/percpu.h> 50 #include <linux/percpu.h>
51 #include <linux/rcupdate.h> 51 #include <linux/rcupdate.h>
52 #include <linux/gfp.h> 52 #include <linux/gfp.h>
53 53
54 struct percpu_ref; 54 struct percpu_ref;
55 typedef void (percpu_ref_func_t)(struct percpu_ref *); 55 typedef void (percpu_ref_func_t)(struct percpu_ref *);
56 56
57 /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ 57 /* flags set in the lower bits of percpu_ref->percpu_count_ptr */
58 enum { 58 enum {
59 __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ 59 __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
60 __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ 60 __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
61 __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, 61 __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
62 62
63 __PERCPU_REF_FLAG_BITS = 2, 63 __PERCPU_REF_FLAG_BITS = 2,
64 }; 64 };
65 65
66 /* @flags for percpu_ref_init() */ 66 /* @flags for percpu_ref_init() */
67 enum { 67 enum {
68 /* 68 /*
69 * Start w/ ref == 1 in atomic mode. Can be switched to percpu 69 * Start w/ ref == 1 in atomic mode. Can be switched to percpu
70 * operation using percpu_ref_switch_to_percpu(). If initialized 70 * operation using percpu_ref_switch_to_percpu(). If initialized
71 * with this flag, the ref will stay in atomic mode until 71 * with this flag, the ref will stay in atomic mode until
72 * percpu_ref_switch_to_percpu() is invoked on it. 72 * percpu_ref_switch_to_percpu() is invoked on it.
73 */ 73 */
74 PERCPU_REF_INIT_ATOMIC = 1 << 0, 74 PERCPU_REF_INIT_ATOMIC = 1 << 0,
75 75
76 /* 76 /*
77 * Start dead w/ ref == 0 in atomic mode. Must be revived with 77 * Start dead w/ ref == 0 in atomic mode. Must be revived with
78 * percpu_ref_reinit() before used. Implies INIT_ATOMIC. 78 * percpu_ref_reinit() before used. Implies INIT_ATOMIC.
79 */ 79 */
80 PERCPU_REF_INIT_DEAD = 1 << 1, 80 PERCPU_REF_INIT_DEAD = 1 << 1,
81 }; 81 };
82 82
83 struct percpu_ref { 83 struct percpu_ref {
84 atomic_long_t count; 84 atomic_long_t count;
85 /* 85 /*
86 * The low bit of the pointer indicates whether the ref is in percpu 86 * The low bit of the pointer indicates whether the ref is in percpu
87 * mode; if set, then get/put will manipulate the atomic_t. 87 * mode; if set, then get/put will manipulate the atomic_t.
88 */ 88 */
89 unsigned long percpu_count_ptr; 89 unsigned long percpu_count_ptr;
90 percpu_ref_func_t *release; 90 percpu_ref_func_t *release;
91 percpu_ref_func_t *confirm_switch; 91 percpu_ref_func_t *confirm_switch;
92 bool force_atomic:1; 92 bool force_atomic:1;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 }; 94 };
95 95
96 int __must_check percpu_ref_init(struct percpu_ref *ref, 96 int __must_check percpu_ref_init(struct percpu_ref *ref,
97 percpu_ref_func_t *release, unsigned int flags, 97 percpu_ref_func_t *release, unsigned int flags,
98 gfp_t gfp); 98 gfp_t gfp);
99 void percpu_ref_exit(struct percpu_ref *ref); 99 void percpu_ref_exit(struct percpu_ref *ref);
100 void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 100 void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
101 percpu_ref_func_t *confirm_switch); 101 percpu_ref_func_t *confirm_switch);
102 void percpu_ref_switch_to_percpu(struct percpu_ref *ref); 102 void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
104 percpu_ref_func_t *confirm_kill); 104 percpu_ref_func_t *confirm_kill);
105 void percpu_ref_reinit(struct percpu_ref *ref); 105 void percpu_ref_reinit(struct percpu_ref *ref);
106 106
107 /** 107 /**
108 * percpu_ref_kill - drop the initial ref 108 * percpu_ref_kill - drop the initial ref
109 * @ref: percpu_ref to kill 109 * @ref: percpu_ref to kill
110 * 110 *
111 * Must be used to drop the initial ref on a percpu refcount; must be called 111 * Must be used to drop the initial ref on a percpu refcount; must be called
112 * precisely once before shutdown. 112 * precisely once before shutdown.
113 * 113 *
114 * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the 114 * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
115 * percpu counters and dropping the initial ref. 115 * percpu counters and dropping the initial ref.
116 */ 116 */
117 static inline void percpu_ref_kill(struct percpu_ref *ref) 117 static inline void percpu_ref_kill(struct percpu_ref *ref)
118 { 118 {
119 return percpu_ref_kill_and_confirm(ref, NULL); 119 return percpu_ref_kill_and_confirm(ref, NULL);
120 } 120 }
121 121
122 /* 122 /*
123 * Internal helper. Don't use outside percpu-refcount proper. The 123 * Internal helper. Don't use outside percpu-refcount proper. The
124 * function doesn't return the pointer and let the caller test it for NULL 124 * function doesn't return the pointer and let the caller test it for NULL
125 * because doing so forces the compiler to generate two conditional 125 * because doing so forces the compiler to generate two conditional
126 * branches as it can't assume that @ref->percpu_count is not NULL. 126 * branches as it can't assume that @ref->percpu_count is not NULL.
127 */ 127 */
128 static inline bool __ref_is_percpu(struct percpu_ref *ref, 128 static inline bool __ref_is_percpu(struct percpu_ref *ref,
129 unsigned long __percpu **percpu_countp) 129 unsigned long __percpu **percpu_countp)
130 { 130 {
131 unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); 131 unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr);
132 132
133 /* paired with smp_store_release() in percpu_ref_reinit() */ 133 /* paired with smp_store_release() in percpu_ref_reinit() */
134 smp_read_barrier_depends(); 134 smp_read_barrier_depends();
135 135
136 /* 136 /*
137 * Theoretically, the following could test just ATOMIC; however, 137 * Theoretically, the following could test just ATOMIC; however,
138 * then we'd have to mask off DEAD separately as DEAD may be 138 * then we'd have to mask off DEAD separately as DEAD may be
139 * visible without ATOMIC if we race with percpu_ref_kill(). DEAD 139 * visible without ATOMIC if we race with percpu_ref_kill(). DEAD
140 * implies ATOMIC anyway. Test them together. 140 * implies ATOMIC anyway. Test them together.
141 */ 141 */
142 if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) 142 if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
143 return false; 143 return false;
144 144
145 *percpu_countp = (unsigned long __percpu *)percpu_ptr; 145 *percpu_countp = (unsigned long __percpu *)percpu_ptr;
146 return true; 146 return true;
147 } 147 }
148 148
149 /** 149 /**
150 * percpu_ref_get - increment a percpu refcount 150 * percpu_ref_get_many - increment a percpu refcount
151 * @ref: percpu_ref to get 151 * @ref: percpu_ref to get
152 * @nr: number of references to get
152 * 153 *
153 * Analagous to atomic_long_inc(). 154 * Analogous to atomic_long_add().
154 * 155 *
155 * This function is safe to call as long as @ref is between init and exit. 156 * This function is safe to call as long as @ref is between init and exit.
156 */ 157 */
157 static inline void percpu_ref_get(struct percpu_ref *ref) 158 static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
158 { 159 {
159 unsigned long __percpu *percpu_count; 160 unsigned long __percpu *percpu_count;
160 161
161 rcu_read_lock_sched(); 162 rcu_read_lock_sched();
162 163
163 if (__ref_is_percpu(ref, &percpu_count)) 164 if (__ref_is_percpu(ref, &percpu_count))
164 this_cpu_inc(*percpu_count); 165 this_cpu_add(*percpu_count, nr);
165 else 166 else
166 atomic_long_inc(&ref->count); 167 atomic_long_add(nr, &ref->count);
167 168
168 rcu_read_unlock_sched(); 169 rcu_read_unlock_sched();
169 } 170 }
170 171
171 /** 172 /**
173 * percpu_ref_get - increment a percpu refcount
174 * @ref: percpu_ref to get
175 *
176 * Analogous to atomic_long_inc().
177 *
178 * This function is safe to call as long as @ref is between init and exit.
179 */
180 static inline void percpu_ref_get(struct percpu_ref *ref)
181 {
182 percpu_ref_get_many(ref, 1);
183 }
184
185 /**
172 * percpu_ref_tryget - try to increment a percpu refcount 186 * percpu_ref_tryget - try to increment a percpu refcount
173 * @ref: percpu_ref to try-get 187 * @ref: percpu_ref to try-get
174 * 188 *
175 * Increment a percpu refcount unless its count already reached zero. 189 * Increment a percpu refcount unless its count already reached zero.
176 * Returns %true on success; %false on failure. 190 * Returns %true on success; %false on failure.
177 * 191 *
178 * This function is safe to call as long as @ref is between init and exit. 192 * This function is safe to call as long as @ref is between init and exit.
179 */ 193 */
180 static inline bool percpu_ref_tryget(struct percpu_ref *ref) 194 static inline bool percpu_ref_tryget(struct percpu_ref *ref)
181 { 195 {
182 unsigned long __percpu *percpu_count; 196 unsigned long __percpu *percpu_count;
183 int ret; 197 int ret;
184 198
185 rcu_read_lock_sched(); 199 rcu_read_lock_sched();
186 200
187 if (__ref_is_percpu(ref, &percpu_count)) { 201 if (__ref_is_percpu(ref, &percpu_count)) {
188 this_cpu_inc(*percpu_count); 202 this_cpu_inc(*percpu_count);
189 ret = true; 203 ret = true;
190 } else { 204 } else {
191 ret = atomic_long_inc_not_zero(&ref->count); 205 ret = atomic_long_inc_not_zero(&ref->count);
192 } 206 }
193 207
194 rcu_read_unlock_sched(); 208 rcu_read_unlock_sched();
195 209
196 return ret; 210 return ret;
197 } 211 }
198 212
199 /** 213 /**
200 * percpu_ref_tryget_live - try to increment a live percpu refcount 214 * percpu_ref_tryget_live - try to increment a live percpu refcount
201 * @ref: percpu_ref to try-get 215 * @ref: percpu_ref to try-get
202 * 216 *
203 * Increment a percpu refcount unless it has already been killed. Returns 217 * Increment a percpu refcount unless it has already been killed. Returns
204 * %true on success; %false on failure. 218 * %true on success; %false on failure.
205 * 219 *
206 * Completion of percpu_ref_kill() in itself doesn't guarantee that this 220 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
207 * function will fail. For such guarantee, percpu_ref_kill_and_confirm() 221 * function will fail. For such guarantee, percpu_ref_kill_and_confirm()
208 * should be used. After the confirm_kill callback is invoked, it's 222 * should be used. After the confirm_kill callback is invoked, it's
209 * guaranteed that no new reference will be given out by 223 * guaranteed that no new reference will be given out by
210 * percpu_ref_tryget_live(). 224 * percpu_ref_tryget_live().
211 * 225 *
212 * This function is safe to call as long as @ref is between init and exit. 226 * This function is safe to call as long as @ref is between init and exit.
213 */ 227 */
214 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) 228 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
215 { 229 {
216 unsigned long __percpu *percpu_count; 230 unsigned long __percpu *percpu_count;
217 int ret = false; 231 int ret = false;
218 232
219 rcu_read_lock_sched(); 233 rcu_read_lock_sched();
220 234
221 if (__ref_is_percpu(ref, &percpu_count)) { 235 if (__ref_is_percpu(ref, &percpu_count)) {
222 this_cpu_inc(*percpu_count); 236 this_cpu_inc(*percpu_count);
223 ret = true; 237 ret = true;
224 } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { 238 } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) {
225 ret = atomic_long_inc_not_zero(&ref->count); 239 ret = atomic_long_inc_not_zero(&ref->count);
226 } 240 }
227 241
228 rcu_read_unlock_sched(); 242 rcu_read_unlock_sched();
229 243
230 return ret; 244 return ret;
231 } 245 }
232 246
233 /** 247 /**
234 * percpu_ref_put - decrement a percpu refcount 248 * percpu_ref_put_many - decrement a percpu refcount
235 * @ref: percpu_ref to put 249 * @ref: percpu_ref to put
250 * @nr: number of references to put
236 * 251 *
237 * Decrement the refcount, and if 0, call the release function (which was passed 252 * Decrement the refcount, and if 0, call the release function (which was passed
238 * to percpu_ref_init()) 253 * to percpu_ref_init())
239 * 254 *
240 * This function is safe to call as long as @ref is between init and exit. 255 * This function is safe to call as long as @ref is between init and exit.
241 */ 256 */
242 static inline void percpu_ref_put(struct percpu_ref *ref) 257 static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
243 { 258 {
244 unsigned long __percpu *percpu_count; 259 unsigned long __percpu *percpu_count;
245 260
246 rcu_read_lock_sched(); 261 rcu_read_lock_sched();
247 262
248 if (__ref_is_percpu(ref, &percpu_count)) 263 if (__ref_is_percpu(ref, &percpu_count))
249 this_cpu_dec(*percpu_count); 264 this_cpu_sub(*percpu_count, nr);
250 else if (unlikely(atomic_long_dec_and_test(&ref->count))) 265 else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
251 ref->release(ref); 266 ref->release(ref);
252 267
253 rcu_read_unlock_sched(); 268 rcu_read_unlock_sched();
269 }
270
271 /**
272 * percpu_ref_put - decrement a percpu refcount
273 * @ref: percpu_ref to put
274 *
275 * Decrement the refcount, and if 0, call the release function (which was passed
276 * to percpu_ref_init())
277 *
278 * This function is safe to call as long as @ref is between init and exit.
279 */
280 static inline void percpu_ref_put(struct percpu_ref *ref)
281 {
282 percpu_ref_put_many(ref, 1);
254 } 283 }
255 284
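Editor's note: the new *_many variants are what allows a user such as memcg to take one reference per charged page in a single percpu operation instead of a loop. A hedged sketch of a batched pin/unpin; my_charge(), my_uncharge() and the -ENODEV error are illustrative.

#include <linux/percpu-refcount.h>
#include <linux/errno.h>

/* Pin @ref once per page in a charge batch, refusing already-killed refs. */
static int my_charge(struct percpu_ref *ref, unsigned int nr_pages)
{
        if (!percpu_ref_tryget_live(ref))
                return -ENODEV;         /* already killed, refuse new charges */
        if (nr_pages > 1)
                percpu_ref_get_many(ref, nr_pages - 1); /* tryget took the first */
        return 0;
}

/* Drop all references taken for the batch in one operation. */
static void my_uncharge(struct percpu_ref *ref, unsigned int nr_pages)
{
        percpu_ref_put_many(ref, nr_pages);
}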
256 /** 285 /**
257 * percpu_ref_is_zero - test whether a percpu refcount reached zero 286 * percpu_ref_is_zero - test whether a percpu refcount reached zero
258 * @ref: percpu_ref to test 287 * @ref: percpu_ref to test
259 * 288 *
260 * Returns %true if @ref reached zero. 289 * Returns %true if @ref reached zero.
261 * 290 *
262 * This function is safe to call as long as @ref is between init and exit. 291 * This function is safe to call as long as @ref is between init and exit.
263 */ 292 */
264 static inline bool percpu_ref_is_zero(struct percpu_ref *ref) 293 static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
265 { 294 {
266 unsigned long __percpu *percpu_count; 295 unsigned long __percpu *percpu_count;
267 296
268 if (__ref_is_percpu(ref, &percpu_count)) 297 if (__ref_is_percpu(ref, &percpu_count))
269 return false; 298 return false;
270 return !atomic_long_read(&ref->count); 299 return !atomic_long_read(&ref->count);
271 } 300 }
272 301
273 #endif 302 #endif
274 303
mm/memcontrol.c
1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller 13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version. 20 * (at your option) any later version.
21 * 21 *
22 * This program is distributed in the hope that it will be useful, 22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28 #include <linux/page_counter.h> 28 #include <linux/page_counter.h>
29 #include <linux/memcontrol.h> 29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h> 30 #include <linux/cgroup.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/hugetlb.h> 32 #include <linux/hugetlb.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/smp.h> 34 #include <linux/smp.h>
35 #include <linux/page-flags.h> 35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h> 36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h> 37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h> 38 #include <linux/rcupdate.h>
39 #include <linux/limits.h> 39 #include <linux/limits.h>
40 #include <linux/export.h> 40 #include <linux/export.h>
41 #include <linux/mutex.h> 41 #include <linux/mutex.h>
42 #include <linux/rbtree.h> 42 #include <linux/rbtree.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/swap.h> 44 #include <linux/swap.h>
45 #include <linux/swapops.h> 45 #include <linux/swapops.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h> 47 #include <linux/eventfd.h>
48 #include <linux/poll.h> 48 #include <linux/poll.h>
49 #include <linux/sort.h> 49 #include <linux/sort.h>
50 #include <linux/fs.h> 50 #include <linux/fs.h>
51 #include <linux/seq_file.h> 51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h> 52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h> 53 #include <linux/mm_inline.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/cpu.h> 55 #include <linux/cpu.h>
56 #include <linux/oom.h> 56 #include <linux/oom.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/file.h> 58 #include <linux/file.h>
59 #include "internal.h" 59 #include "internal.h"
60 #include <net/sock.h> 60 #include <net/sock.h>
61 #include <net/ip.h> 61 #include <net/ip.h>
62 #include <net/tcp_memcontrol.h> 62 #include <net/tcp_memcontrol.h>
63 #include "slab.h" 63 #include "slab.h"
64 64
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 66
67 #include <trace/events/vmscan.h> 67 #include <trace/events/vmscan.h>
68 68
69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70 EXPORT_SYMBOL(memory_cgrp_subsys); 70 EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72 #define MEM_CGROUP_RECLAIM_RETRIES 5 72 #define MEM_CGROUP_RECLAIM_RETRIES 5
73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 73 static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75 #ifdef CONFIG_MEMCG_SWAP 75 #ifdef CONFIG_MEMCG_SWAP
76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77 int do_swap_account __read_mostly; 77 int do_swap_account __read_mostly;
78 78
79 /* to remember the boot option */ 79 /* to remember the boot option */
80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED
81 static int really_do_swap_account __initdata = 1; 81 static int really_do_swap_account __initdata = 1;
82 #else 82 #else
83 static int really_do_swap_account __initdata; 83 static int really_do_swap_account __initdata;
84 #endif 84 #endif
85 85
86 #else 86 #else
87 #define do_swap_account 0 87 #define do_swap_account 0
88 #endif 88 #endif
89 89
90 90
91 static const char * const mem_cgroup_stat_names[] = { 91 static const char * const mem_cgroup_stat_names[] = {
92 "cache", 92 "cache",
93 "rss", 93 "rss",
94 "rss_huge", 94 "rss_huge",
95 "mapped_file", 95 "mapped_file",
96 "writeback", 96 "writeback",
97 "swap", 97 "swap",
98 }; 98 };
99 99
100 enum mem_cgroup_events_index { 100 enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS, 105 MEM_CGROUP_EVENTS_NSTATS,
106 }; 106 };
107 107
108 static const char * const mem_cgroup_events_names[] = { 108 static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 109 "pgpgin",
110 "pgpgout", 110 "pgpgout",
111 "pgfault", 111 "pgfault",
112 "pgmajfault", 112 "pgmajfault",
113 }; 113 };
114 114
115 static const char * const mem_cgroup_lru_names[] = { 115 static const char * const mem_cgroup_lru_names[] = {
116 "inactive_anon", 116 "inactive_anon",
117 "active_anon", 117 "active_anon",
118 "inactive_file", 118 "inactive_file",
119 "active_file", 119 "active_file",
120 "unevictable", 120 "unevictable",
121 }; 121 };
122 122
123 /* 123 /*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 124 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremated by the number of pages. This counter is used for 125 * it will be incremated by the number of pages. This counter is used for
126 * for trigger some periodic events. This is straightforward and better 126 * for trigger some periodic events. This is straightforward and better
127 * than using jiffies etc. to handle periodic memcg event. 127 * than using jiffies etc. to handle periodic memcg event.
128 */ 128 */
129 enum mem_cgroup_events_target { 129 enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH, 130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT, 131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO, 132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS, 133 MEM_CGROUP_NTARGETS,
134 }; 134 };
135 #define THRESHOLDS_EVENTS_TARGET 128 135 #define THRESHOLDS_EVENTS_TARGET 128
136 #define SOFTLIMIT_EVENTS_TARGET 1024 136 #define SOFTLIMIT_EVENTS_TARGET 1024
137 #define NUMAINFO_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024
138 138
139 struct mem_cgroup_stat_cpu { 139 struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events; 142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144 }; 144 };
145 145
146 struct reclaim_iter { 146 struct reclaim_iter {
147 struct mem_cgroup *position; 147 struct mem_cgroup *position;
148 /* scan generation, increased every round-trip */ 148 /* scan generation, increased every round-trip */
149 unsigned int generation; 149 unsigned int generation;
150 }; 150 };
151 151
152 /* 152 /*
153 * per-zone information in memory controller. 153 * per-zone information in memory controller.
154 */ 154 */
155 struct mem_cgroup_per_zone { 155 struct mem_cgroup_per_zone {
156 struct lruvec lruvec; 156 struct lruvec lruvec;
157 unsigned long lru_size[NR_LRU_LISTS]; 157 unsigned long lru_size[NR_LRU_LISTS];
158 158
159 struct reclaim_iter iter[DEF_PRIORITY + 1]; 159 struct reclaim_iter iter[DEF_PRIORITY + 1];
160 160
161 struct rb_node tree_node; /* RB tree node */ 161 struct rb_node tree_node; /* RB tree node */
162 unsigned long usage_in_excess;/* Set to the value by which */ 162 unsigned long usage_in_excess;/* Set to the value by which */
163 /* the soft limit is exceeded*/ 163 /* the soft limit is exceeded*/
164 bool on_tree; 164 bool on_tree;
165 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */
166 /* use container_of */ 166 /* use container_of */
167 }; 167 };
168 168
169 struct mem_cgroup_per_node { 169 struct mem_cgroup_per_node {
170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
171 }; 171 };
172 172
173 /* 173 /*
174 * Cgroups above their limits are maintained in a RB-Tree, independent of 174 * Cgroups above their limits are maintained in a RB-Tree, independent of
175 * their hierarchy representation 175 * their hierarchy representation
176 */ 176 */
177 177
178 struct mem_cgroup_tree_per_zone { 178 struct mem_cgroup_tree_per_zone {
179 struct rb_root rb_root; 179 struct rb_root rb_root;
180 spinlock_t lock; 180 spinlock_t lock;
181 }; 181 };
182 182
183 struct mem_cgroup_tree_per_node { 183 struct mem_cgroup_tree_per_node {
184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
185 }; 185 };
186 186
187 struct mem_cgroup_tree { 187 struct mem_cgroup_tree {
188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
189 }; 189 };
190 190
191 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 191 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
192 192
193 struct mem_cgroup_threshold { 193 struct mem_cgroup_threshold {
194 struct eventfd_ctx *eventfd; 194 struct eventfd_ctx *eventfd;
195 unsigned long threshold; 195 unsigned long threshold;
196 }; 196 };
197 197
198 /* For threshold */ 198 /* For threshold */
199 struct mem_cgroup_threshold_ary { 199 struct mem_cgroup_threshold_ary {
200 /* An array index points to threshold just below or equal to usage. */ 200 /* An array index points to threshold just below or equal to usage. */
201 int current_threshold; 201 int current_threshold;
202 /* Size of entries[] */ 202 /* Size of entries[] */
203 unsigned int size; 203 unsigned int size;
204 /* Array of thresholds */ 204 /* Array of thresholds */
205 struct mem_cgroup_threshold entries[0]; 205 struct mem_cgroup_threshold entries[0];
206 }; 206 };
207 207
208 struct mem_cgroup_thresholds { 208 struct mem_cgroup_thresholds {
209 /* Primary thresholds array */ 209 /* Primary thresholds array */
210 struct mem_cgroup_threshold_ary *primary; 210 struct mem_cgroup_threshold_ary *primary;
211 /* 211 /*
212 * Spare threshold array. 212 * Spare threshold array.
213 * This is needed to make mem_cgroup_unregister_event() "never fail". 213 * This is needed to make mem_cgroup_unregister_event() "never fail".
214 * It must be able to store at least primary->size - 1 entries. 214 * It must be able to store at least primary->size - 1 entries.
215 */ 215 */
216 struct mem_cgroup_threshold_ary *spare; 216 struct mem_cgroup_threshold_ary *spare;
217 }; 217 };
218 218
219 /* for OOM */ 219 /* for OOM */
220 struct mem_cgroup_eventfd_list { 220 struct mem_cgroup_eventfd_list {
221 struct list_head list; 221 struct list_head list;
222 struct eventfd_ctx *eventfd; 222 struct eventfd_ctx *eventfd;
223 }; 223 };
224 224
225 /* 225 /*
226 * cgroup_event represents events which userspace wants to receive. 226 * cgroup_event represents events which userspace wants to receive.
227 */ 227 */
228 struct mem_cgroup_event { 228 struct mem_cgroup_event {
229 /* 229 /*
230 * memcg which the event belongs to. 230 * memcg which the event belongs to.
231 */ 231 */
232 struct mem_cgroup *memcg; 232 struct mem_cgroup *memcg;
233 /* 233 /*
234 * eventfd to signal userspace about the event. 234 * eventfd to signal userspace about the event.
235 */ 235 */
236 struct eventfd_ctx *eventfd; 236 struct eventfd_ctx *eventfd;
237 /* 237 /*
238 * Each of these is stored in a list by the cgroup. 238 * Each of these is stored in a list by the cgroup.
239 */ 239 */
240 struct list_head list; 240 struct list_head list;
241 /* 241 /*
242 * register_event() callback will be used to add new userspace 242 * register_event() callback will be used to add new userspace
243 * waiter for changes related to this event. Use eventfd_signal() 243 * waiter for changes related to this event. Use eventfd_signal()
244 * on eventfd to send notification to userspace. 244 * on eventfd to send notification to userspace.
245 */ 245 */
246 int (*register_event)(struct mem_cgroup *memcg, 246 int (*register_event)(struct mem_cgroup *memcg,
247 struct eventfd_ctx *eventfd, const char *args); 247 struct eventfd_ctx *eventfd, const char *args);
248 /* 248 /*
249 * unregister_event() callback will be called when userspace closes 249 * unregister_event() callback will be called when userspace closes
250 * the eventfd or when the cgroup is removed. This callback must be set, 250 * the eventfd or when the cgroup is removed. This callback must be set,
251 * if you want to provide notification functionality. 251 * if you want to provide notification functionality.
252 */ 252 */
253 void (*unregister_event)(struct mem_cgroup *memcg, 253 void (*unregister_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd); 254 struct eventfd_ctx *eventfd);
255 /* 255 /*
256 * All fields below needed to unregister event when 256 * All fields below needed to unregister event when
257 * userspace closes eventfd. 257 * userspace closes eventfd.
258 */ 258 */
259 poll_table pt; 259 poll_table pt;
260 wait_queue_head_t *wqh; 260 wait_queue_head_t *wqh;
261 wait_queue_t wait; 261 wait_queue_t wait;
262 struct work_struct remove; 262 struct work_struct remove;
263 }; 263 };
264 264
265 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 265 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
267 267
268 /* 268 /*
269 * The memory controller data structure. The memory controller controls both 269 * The memory controller data structure. The memory controller controls both
270 * page cache and RSS per cgroup. We would eventually like to provide 270 * page cache and RSS per cgroup. We would eventually like to provide
271 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 271 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
272 * to help the administrator determine what knobs to tune. 272 * to help the administrator determine what knobs to tune.
273 * 273 *
274 * TODO: Add a water mark for the memory controller. Reclaim will begin when 274 * TODO: Add a water mark for the memory controller. Reclaim will begin when
275 * we hit the water mark. May be even add a low water mark, such that 275 * we hit the water mark. May be even add a low water mark, such that
276 * no reclaim occurs from a cgroup at its low water mark, this is 276 * no reclaim occurs from a cgroup at its low water mark, this is
277 * a feature that will be implemented much later in the future. 277 * a feature that will be implemented much later in the future.
278 */ 278 */
279 struct mem_cgroup { 279 struct mem_cgroup {
280 struct cgroup_subsys_state css; 280 struct cgroup_subsys_state css;
281 281
282 /* Accounted resources */ 282 /* Accounted resources */
283 struct page_counter memory; 283 struct page_counter memory;
284 struct page_counter memsw; 284 struct page_counter memsw;
285 struct page_counter kmem; 285 struct page_counter kmem;
286 286
287 unsigned long soft_limit; 287 unsigned long soft_limit;
288 288
289 /* vmpressure notifications */ 289 /* vmpressure notifications */
290 struct vmpressure vmpressure; 290 struct vmpressure vmpressure;
291 291
292 /* css_online() has been completed */ 292 /* css_online() has been completed */
293 int initialized; 293 int initialized;
294 294
295 /* 295 /*
296 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
297 */ 297 */
298 bool use_hierarchy; 298 bool use_hierarchy;
299 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 299 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
300 300
301 bool oom_lock; 301 bool oom_lock;
302 atomic_t under_oom; 302 atomic_t under_oom;
303 atomic_t oom_wakeups; 303 atomic_t oom_wakeups;
304 304
305 int swappiness; 305 int swappiness;
306 /* OOM-Killer disable */ 306 /* OOM-Killer disable */
307 int oom_kill_disable; 307 int oom_kill_disable;
308 308
309 /* protect arrays of thresholds */ 309 /* protect arrays of thresholds */
310 struct mutex thresholds_lock; 310 struct mutex thresholds_lock;
311 311
312 /* thresholds for memory usage. RCU-protected */ 312 /* thresholds for memory usage. RCU-protected */
313 struct mem_cgroup_thresholds thresholds; 313 struct mem_cgroup_thresholds thresholds;
314 314
315 /* thresholds for mem+swap usage. RCU-protected */ 315 /* thresholds for mem+swap usage. RCU-protected */
316 struct mem_cgroup_thresholds memsw_thresholds; 316 struct mem_cgroup_thresholds memsw_thresholds;
317 317
318 /* For oom notifier event fd */ 318 /* For oom notifier event fd */
319 struct list_head oom_notify; 319 struct list_head oom_notify;
320 320
321 /* 321 /*
322 * Should we move charges of a task when a task is moved into this 322 * Should we move charges of a task when a task is moved into this
323 * mem_cgroup ? And what type of charges should we move ? 323 * mem_cgroup ? And what type of charges should we move ?
324 */ 324 */
325 unsigned long move_charge_at_immigrate; 325 unsigned long move_charge_at_immigrate;
326 /* 326 /*
327 * set > 0 if pages under this cgroup are moving to other cgroup. 327 * set > 0 if pages under this cgroup are moving to other cgroup.
328 */ 328 */
329 atomic_t moving_account; 329 atomic_t moving_account;
330 /* taken only while moving_account > 0 */ 330 /* taken only while moving_account > 0 */
331 spinlock_t move_lock; 331 spinlock_t move_lock;
332 /* 332 /*
333 * percpu counter. 333 * percpu counter.
334 */ 334 */
335 struct mem_cgroup_stat_cpu __percpu *stat; 335 struct mem_cgroup_stat_cpu __percpu *stat;
336 /* 336 /*
337 * used when a cpu is offlined or other synchronizations 337 * used when a cpu is offlined or other synchronizations
338 * See mem_cgroup_read_stat(). 338 * See mem_cgroup_read_stat().
339 */ 339 */
340 struct mem_cgroup_stat_cpu nocpu_base; 340 struct mem_cgroup_stat_cpu nocpu_base;
341 spinlock_t pcp_counter_lock; 341 spinlock_t pcp_counter_lock;
342 342
343 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
344 struct cg_proto tcp_mem; 344 struct cg_proto tcp_mem;
345 #endif 345 #endif
346 #if defined(CONFIG_MEMCG_KMEM) 346 #if defined(CONFIG_MEMCG_KMEM)
347 /* analogous to slab_common's slab_caches list, but per-memcg; 347 /* analogous to slab_common's slab_caches list, but per-memcg;
348 * protected by memcg_slab_mutex */ 348 * protected by memcg_slab_mutex */
349 struct list_head memcg_slab_caches; 349 struct list_head memcg_slab_caches;
350 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 350 /* Index in the kmem_cache->memcg_params->memcg_caches array */
351 int kmemcg_id; 351 int kmemcg_id;
352 #endif 352 #endif
353 353
354 int last_scanned_node; 354 int last_scanned_node;
355 #if MAX_NUMNODES > 1 355 #if MAX_NUMNODES > 1
356 nodemask_t scan_nodes; 356 nodemask_t scan_nodes;
357 atomic_t numainfo_events; 357 atomic_t numainfo_events;
358 atomic_t numainfo_updating; 358 atomic_t numainfo_updating;
359 #endif 359 #endif
360 360
361 /* List of events which userspace wants to receive */ 361 /* List of events which userspace wants to receive */
362 struct list_head event_list; 362 struct list_head event_list;
363 spinlock_t event_list_lock; 363 spinlock_t event_list_lock;
364 364
365 struct mem_cgroup_per_node *nodeinfo[0]; 365 struct mem_cgroup_per_node *nodeinfo[0];
366 /* WARNING: nodeinfo must be the last member here */ 366 /* WARNING: nodeinfo must be the last member here */
367 }; 367 };
368 368
369 /* internal only representation about the status of kmem accounting. */ 369 /* internal only representation about the status of kmem accounting. */
370 enum { 370 enum {
371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
372 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 372 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
373 }; 373 };
374 374
375 #ifdef CONFIG_MEMCG_KMEM 375 #ifdef CONFIG_MEMCG_KMEM
376 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 376 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
377 { 377 {
378 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 378 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
379 } 379 }
380 380
381 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 381 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
382 { 382 {
383 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 383 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
384 } 384 }
385 385
386 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 386 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387 { 387 {
388 /* 388 /*
389 * Our caller must use css_get() first, because memcg_uncharge_kmem() 389 * Our caller must use css_get() first, because memcg_uncharge_kmem()
390 * will call css_put() if it sees the memcg is dead. 390 * will call css_put() if it sees the memcg is dead.
391 */ 391 */
392 smp_wmb(); 392 smp_wmb();
393 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 393 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
394 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 394 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
395 } 395 }
396 396
397 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 397 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 { 398 {
399 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 399 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
400 &memcg->kmem_account_flags); 400 &memcg->kmem_account_flags);
401 } 401 }
402 #endif 402 #endif
403 403
404 /* Stuffs for move charges at task migration. */ 404 /* Stuffs for move charges at task migration. */
405 /* 405 /*
406 * Types of charges to be moved. "move_charge_at_immigrate" and 406 * Types of charges to be moved. "move_charge_at_immigrate" and
407 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 407 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
408 */ 408 */
409 enum move_type { 409 enum move_type {
410 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 410 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
411 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 411 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
412 NR_MOVE_TYPE, 412 NR_MOVE_TYPE,
413 }; 413 };
414 414
415 /* "mc" and its members are protected by cgroup_mutex */ 415 /* "mc" and its members are protected by cgroup_mutex */
416 static struct move_charge_struct { 416 static struct move_charge_struct {
417 spinlock_t lock; /* for from, to */ 417 spinlock_t lock; /* for from, to */
418 struct mem_cgroup *from; 418 struct mem_cgroup *from;
419 struct mem_cgroup *to; 419 struct mem_cgroup *to;
420 unsigned long immigrate_flags; 420 unsigned long immigrate_flags;
421 unsigned long precharge; 421 unsigned long precharge;
422 unsigned long moved_charge; 422 unsigned long moved_charge;
423 unsigned long moved_swap; 423 unsigned long moved_swap;
424 struct task_struct *moving_task; /* a task moving charges */ 424 struct task_struct *moving_task; /* a task moving charges */
425 wait_queue_head_t waitq; /* a waitq for other context */ 425 wait_queue_head_t waitq; /* a waitq for other context */
426 } mc = { 426 } mc = {
427 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 427 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
428 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 428 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
429 }; 429 };
430 430
431 static bool move_anon(void) 431 static bool move_anon(void)
432 { 432 {
433 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 433 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
434 } 434 }
435 435
436 static bool move_file(void) 436 static bool move_file(void)
437 { 437 {
438 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 438 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
439 } 439 }
440 440
441 /* 441 /*
442 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 442 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
443 * limit reclaim to prevent infinite loops, if they ever occur. 443 * limit reclaim to prevent infinite loops, if they ever occur.
444 */ 444 */
445 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 445 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
446 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 446 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
447 447
448 enum charge_type { 448 enum charge_type {
449 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 449 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
450 MEM_CGROUP_CHARGE_TYPE_ANON, 450 MEM_CGROUP_CHARGE_TYPE_ANON,
451 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 451 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
452 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 452 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
453 NR_CHARGE_TYPE, 453 NR_CHARGE_TYPE,
454 }; 454 };
455 455
456 /* for encoding cft->private value on file */ 456 /* for encoding cft->private value on file */
457 enum res_type { 457 enum res_type {
458 _MEM, 458 _MEM,
459 _MEMSWAP, 459 _MEMSWAP,
460 _OOM_TYPE, 460 _OOM_TYPE,
461 _KMEM, 461 _KMEM,
462 }; 462 };
463 463
464 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 464 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
465 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 465 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
466 #define MEMFILE_ATTR(val) ((val) & 0xffff) 466 #define MEMFILE_ATTR(val) ((val) & 0xffff)
467 /* Used for OOM notifier */ 467 /* Used for OOM notifier */
468 #define OOM_CONTROL (0) 468 #define OOM_CONTROL (0)
469 469
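Editor's note: the MEMFILE_* macros pack a res_type and an attribute into the single cft->private word used by the memory.* file handlers. A small hedged round-trip check, written as if it lived inside memcontrol.c and using only the constants defined above; memfile_encoding_demo() is illustrative.

/* Encode (type, attr) into one value and recover both pieces. */
static void memfile_encoding_demo(void)
{
        unsigned long priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);

        WARN_ON(MEMFILE_TYPE(priv) != _OOM_TYPE);
        WARN_ON(MEMFILE_ATTR(priv) != OOM_CONTROL);
}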
470 /* 470 /*
471 * The memcg_create_mutex will be held whenever a new cgroup is created. 471 * The memcg_create_mutex will be held whenever a new cgroup is created.
472 * As a consequence, any change that needs to protect against new child cgroups 472 * As a consequence, any change that needs to protect against new child cgroups
473 * appearing has to hold it as well. 473 * appearing has to hold it as well.
474 */ 474 */
475 static DEFINE_MUTEX(memcg_create_mutex); 475 static DEFINE_MUTEX(memcg_create_mutex);
476 476
477 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 477 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
478 { 478 {
479 return s ? container_of(s, struct mem_cgroup, css) : NULL; 479 return s ? container_of(s, struct mem_cgroup, css) : NULL;
480 } 480 }
481 481
482 /* Some nice accessors for the vmpressure. */ 482 /* Some nice accessors for the vmpressure. */
483 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 483 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
484 { 484 {
485 if (!memcg) 485 if (!memcg)
486 memcg = root_mem_cgroup; 486 memcg = root_mem_cgroup;
487 return &memcg->vmpressure; 487 return &memcg->vmpressure;
488 } 488 }
489 489
490 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 490 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
491 { 491 {
492 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 492 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
493 } 493 }
494 494
495 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 495 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
496 { 496 {
497 return (memcg == root_mem_cgroup); 497 return (memcg == root_mem_cgroup);
498 } 498 }
499 499
500 /* 500 /*
501 * We restrict the id in the range of [1, 65535], so it can fit into 501 * We restrict the id in the range of [1, 65535], so it can fit into
502 * an unsigned short. 502 * an unsigned short.
503 */ 503 */
504 #define MEM_CGROUP_ID_MAX USHRT_MAX 504 #define MEM_CGROUP_ID_MAX USHRT_MAX
505 505
506 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 506 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
507 { 507 {
508 return memcg->css.id; 508 return memcg->css.id;
509 } 509 }
510 510
511 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 511 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
512 { 512 {
513 struct cgroup_subsys_state *css; 513 struct cgroup_subsys_state *css;
514 514
515 css = css_from_id(id, &memory_cgrp_subsys); 515 css = css_from_id(id, &memory_cgrp_subsys);
516 return mem_cgroup_from_css(css); 516 return mem_cgroup_from_css(css);
517 } 517 }
518 518
519 /* Writing them here to avoid exposing memcg's inner layout */ 519 /* Writing them here to avoid exposing memcg's inner layout */
520 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 520 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
521 521
522 void sock_update_memcg(struct sock *sk) 522 void sock_update_memcg(struct sock *sk)
523 { 523 {
524 if (mem_cgroup_sockets_enabled) { 524 if (mem_cgroup_sockets_enabled) {
525 struct mem_cgroup *memcg; 525 struct mem_cgroup *memcg;
526 struct cg_proto *cg_proto; 526 struct cg_proto *cg_proto;
527 527
528 BUG_ON(!sk->sk_prot->proto_cgroup); 528 BUG_ON(!sk->sk_prot->proto_cgroup);
529 529
530 /* Socket cloning can throw us here with sk_cgrp already 530 /* Socket cloning can throw us here with sk_cgrp already
531 * filled. It won't, however, necessarily happen from 531 * filled. It won't, however, necessarily happen from
532 * process context. So the test for root memcg given 532 * process context. So the test for root memcg given
533 * the current task's memcg won't help us in this case. 533 * the current task's memcg won't help us in this case.
534 * 534 *
535 * Respecting the original socket's memcg is a better 535 * Respecting the original socket's memcg is a better
536 * decision in this case. 536 * decision in this case.
537 */ 537 */
538 if (sk->sk_cgrp) { 538 if (sk->sk_cgrp) {
539 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 539 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
540 css_get(&sk->sk_cgrp->memcg->css); 540 css_get(&sk->sk_cgrp->memcg->css);
541 return; 541 return;
542 } 542 }
543 543
544 rcu_read_lock(); 544 rcu_read_lock();
545 memcg = mem_cgroup_from_task(current); 545 memcg = mem_cgroup_from_task(current);
546 cg_proto = sk->sk_prot->proto_cgroup(memcg); 546 cg_proto = sk->sk_prot->proto_cgroup(memcg);
547 if (!mem_cgroup_is_root(memcg) && 547 if (!mem_cgroup_is_root(memcg) &&
548 memcg_proto_active(cg_proto) && 548 memcg_proto_active(cg_proto) &&
549 css_tryget_online(&memcg->css)) { 549 css_tryget_online(&memcg->css)) {
550 sk->sk_cgrp = cg_proto; 550 sk->sk_cgrp = cg_proto;
551 } 551 }
552 rcu_read_unlock(); 552 rcu_read_unlock();
553 } 553 }
554 } 554 }
555 EXPORT_SYMBOL(sock_update_memcg); 555 EXPORT_SYMBOL(sock_update_memcg);
556 556
557 void sock_release_memcg(struct sock *sk) 557 void sock_release_memcg(struct sock *sk)
558 { 558 {
559 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 559 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
560 struct mem_cgroup *memcg; 560 struct mem_cgroup *memcg;
561 WARN_ON(!sk->sk_cgrp->memcg); 561 WARN_ON(!sk->sk_cgrp->memcg);
562 memcg = sk->sk_cgrp->memcg; 562 memcg = sk->sk_cgrp->memcg;
563 css_put(&sk->sk_cgrp->memcg->css); 563 css_put(&sk->sk_cgrp->memcg->css);
564 } 564 }
565 } 565 }
566 566
567 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 567 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
568 { 568 {
569 if (!memcg || mem_cgroup_is_root(memcg)) 569 if (!memcg || mem_cgroup_is_root(memcg))
570 return NULL; 570 return NULL;
571 571
572 return &memcg->tcp_mem; 572 return &memcg->tcp_mem;
573 } 573 }
574 EXPORT_SYMBOL(tcp_proto_cgroup); 574 EXPORT_SYMBOL(tcp_proto_cgroup);
575 575
576 static void disarm_sock_keys(struct mem_cgroup *memcg) 576 static void disarm_sock_keys(struct mem_cgroup *memcg)
577 { 577 {
578 if (!memcg_proto_activated(&memcg->tcp_mem)) 578 if (!memcg_proto_activated(&memcg->tcp_mem))
579 return; 579 return;
580 static_key_slow_dec(&memcg_socket_limit_enabled); 580 static_key_slow_dec(&memcg_socket_limit_enabled);
581 } 581 }
582 #else 582 #else
583 static void disarm_sock_keys(struct mem_cgroup *memcg) 583 static void disarm_sock_keys(struct mem_cgroup *memcg)
584 { 584 {
585 } 585 }
586 #endif 586 #endif
587 587
588 #ifdef CONFIG_MEMCG_KMEM 588 #ifdef CONFIG_MEMCG_KMEM
589 /* 589 /*
590 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 590 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
591 * The main reason for not using the cgroup id for this is that it works 591 * The main reason for not using the cgroup id for this is that it works
592 * better in sparse environments, where we have a lot of memcgs but only a 592 * better in sparse environments, where we have a lot of memcgs but only a
593 * few of them kmem-limited. If we had, for instance, 200 memcgs and none 593 * few of them kmem-limited. If we had, for instance, 200 memcgs and none
594 * but the 200th were kmem-limited, indexing by cgroup id would force a 594 * but the 200th were kmem-limited, indexing by cgroup id would force a
595 * 200-entry array for that single group. 595 * 200-entry array for that single group.
596 * 596 *
597 * The current size of the caches array is stored in 597 * The current size of the caches array is stored in
598 * memcg_limited_groups_array_size. It will double each time we have to 598 * memcg_limited_groups_array_size. It will double each time we have to
599 * increase it. 599 * increase it.
600 */ 600 */
601 static DEFINE_IDA(kmem_limited_groups); 601 static DEFINE_IDA(kmem_limited_groups);
602 int memcg_limited_groups_array_size; 602 int memcg_limited_groups_array_size;
603 603
604 /* 604 /*
605 * MIN_SIZE is different than 1, because we would like to avoid going through 605 * MIN_SIZE is different than 1, because we would like to avoid going through
606 * the alloc/free process all the time. In a small machine, 4 kmem-limited 606 * the alloc/free process all the time. In a small machine, 4 kmem-limited
607 * cgroups is a reasonable guess. In the future, it could be a parameter or 607 * cgroups is a reasonable guess. In the future, it could be a parameter or
608 * tunable, but that is strictly not necessary. 608 * tunable, but that is strictly not necessary.
609 * 609 *
610 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 610 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
611 * this constant directly from cgroup, but it is understandable that this is 611 * this constant directly from cgroup, but it is understandable that this is
612 * better kept as an internal representation in cgroup.c. In any case, the 612 * better kept as an internal representation in cgroup.c. In any case, the
613 * cgrp_id space is not getting any smaller, and we don't have to necessarily 613 * cgrp_id space is not getting any smaller, and we don't have to necessarily
614 * increase ours as well if it increases. 614 * increase ours as well if it increases.
615 */ 615 */
616 #define MEMCG_CACHES_MIN_SIZE 4 616 #define MEMCG_CACHES_MIN_SIZE 4
617 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 617 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
618 618
619 /* 619 /*
620 * A lot of the calls to the cache allocation functions are expected to be 620 * A lot of the calls to the cache allocation functions are expected to be
621 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 621 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
622 * conditional on this static branch, we have to allow modules that do 622 * conditional on this static branch, we have to allow modules that do
623 * kmem_cache_alloc and the like to see this symbol as well. 623 * kmem_cache_alloc and the like to see this symbol as well.
624 */ 624 */
625 struct static_key memcg_kmem_enabled_key; 625 struct static_key memcg_kmem_enabled_key;
626 EXPORT_SYMBOL(memcg_kmem_enabled_key); 626 EXPORT_SYMBOL(memcg_kmem_enabled_key);
627 627
628 static void memcg_free_cache_id(int id); 628 static void memcg_free_cache_id(int id);
629 629
630 static void disarm_kmem_keys(struct mem_cgroup *memcg) 630 static void disarm_kmem_keys(struct mem_cgroup *memcg)
631 { 631 {
632 if (memcg_kmem_is_active(memcg)) { 632 if (memcg_kmem_is_active(memcg)) {
633 static_key_slow_dec(&memcg_kmem_enabled_key); 633 static_key_slow_dec(&memcg_kmem_enabled_key);
634 memcg_free_cache_id(memcg->kmemcg_id); 634 memcg_free_cache_id(memcg->kmemcg_id);
635 } 635 }
636 /* 636 /*
637 * This check can't live in kmem destruction function, 637 * This check can't live in kmem destruction function,
638 * since the charges will outlive the cgroup 638 * since the charges will outlive the cgroup
639 */ 639 */
640 WARN_ON(page_counter_read(&memcg->kmem)); 640 WARN_ON(page_counter_read(&memcg->kmem));
641 } 641 }
642 #else 642 #else
643 static void disarm_kmem_keys(struct mem_cgroup *memcg) 643 static void disarm_kmem_keys(struct mem_cgroup *memcg)
644 { 644 {
645 } 645 }
646 #endif /* CONFIG_MEMCG_KMEM */ 646 #endif /* CONFIG_MEMCG_KMEM */
647 647
648 static void disarm_static_keys(struct mem_cgroup *memcg) 648 static void disarm_static_keys(struct mem_cgroup *memcg)
649 { 649 {
650 disarm_sock_keys(memcg); 650 disarm_sock_keys(memcg);
651 disarm_kmem_keys(memcg); 651 disarm_kmem_keys(memcg);
652 } 652 }
653 653
654 static void drain_all_stock_async(struct mem_cgroup *memcg); 654 static void drain_all_stock_async(struct mem_cgroup *memcg);
655 655
656 static struct mem_cgroup_per_zone * 656 static struct mem_cgroup_per_zone *
657 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 657 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
658 { 658 {
659 int nid = zone_to_nid(zone); 659 int nid = zone_to_nid(zone);
660 int zid = zone_idx(zone); 660 int zid = zone_idx(zone);
661 661
662 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 662 return &memcg->nodeinfo[nid]->zoneinfo[zid];
663 } 663 }
664 664
665 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 665 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
666 { 666 {
667 return &memcg->css; 667 return &memcg->css;
668 } 668 }
669 669
670 static struct mem_cgroup_per_zone * 670 static struct mem_cgroup_per_zone *
671 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 671 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
672 { 672 {
673 int nid = page_to_nid(page); 673 int nid = page_to_nid(page);
674 int zid = page_zonenum(page); 674 int zid = page_zonenum(page);
675 675
676 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 676 return &memcg->nodeinfo[nid]->zoneinfo[zid];
677 } 677 }
678 678
679 static struct mem_cgroup_tree_per_zone * 679 static struct mem_cgroup_tree_per_zone *
680 soft_limit_tree_node_zone(int nid, int zid) 680 soft_limit_tree_node_zone(int nid, int zid)
681 { 681 {
682 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 682 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
683 } 683 }
684 684
685 static struct mem_cgroup_tree_per_zone * 685 static struct mem_cgroup_tree_per_zone *
686 soft_limit_tree_from_page(struct page *page) 686 soft_limit_tree_from_page(struct page *page)
687 { 687 {
688 int nid = page_to_nid(page); 688 int nid = page_to_nid(page);
689 int zid = page_zonenum(page); 689 int zid = page_zonenum(page);
690 690
691 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 691 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
692 } 692 }
693 693
694 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 694 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
695 struct mem_cgroup_tree_per_zone *mctz, 695 struct mem_cgroup_tree_per_zone *mctz,
696 unsigned long new_usage_in_excess) 696 unsigned long new_usage_in_excess)
697 { 697 {
698 struct rb_node **p = &mctz->rb_root.rb_node; 698 struct rb_node **p = &mctz->rb_root.rb_node;
699 struct rb_node *parent = NULL; 699 struct rb_node *parent = NULL;
700 struct mem_cgroup_per_zone *mz_node; 700 struct mem_cgroup_per_zone *mz_node;
701 701
702 if (mz->on_tree) 702 if (mz->on_tree)
703 return; 703 return;
704 704
705 mz->usage_in_excess = new_usage_in_excess; 705 mz->usage_in_excess = new_usage_in_excess;
706 if (!mz->usage_in_excess) 706 if (!mz->usage_in_excess)
707 return; 707 return;
708 while (*p) { 708 while (*p) {
709 parent = *p; 709 parent = *p;
710 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 710 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
711 tree_node); 711 tree_node);
712 if (mz->usage_in_excess < mz_node->usage_in_excess) 712 if (mz->usage_in_excess < mz_node->usage_in_excess)
713 p = &(*p)->rb_left; 713 p = &(*p)->rb_left;
714 /* 714 /*
715 * We can't avoid mem cgroups that are over their soft 715 * We can't avoid mem cgroups that are over their soft
716 * limit by the same amount 716 * limit by the same amount
717 */ 717 */
718 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 718 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
719 p = &(*p)->rb_right; 719 p = &(*p)->rb_right;
720 } 720 }
721 rb_link_node(&mz->tree_node, parent, p); 721 rb_link_node(&mz->tree_node, parent, p);
722 rb_insert_color(&mz->tree_node, &mctz->rb_root); 722 rb_insert_color(&mz->tree_node, &mctz->rb_root);
723 mz->on_tree = true; 723 mz->on_tree = true;
724 } 724 }
725 725
726 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 726 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
727 struct mem_cgroup_tree_per_zone *mctz) 727 struct mem_cgroup_tree_per_zone *mctz)
728 { 728 {
729 if (!mz->on_tree) 729 if (!mz->on_tree)
730 return; 730 return;
731 rb_erase(&mz->tree_node, &mctz->rb_root); 731 rb_erase(&mz->tree_node, &mctz->rb_root);
732 mz->on_tree = false; 732 mz->on_tree = false;
733 } 733 }
734 734
735 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 735 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
736 struct mem_cgroup_tree_per_zone *mctz) 736 struct mem_cgroup_tree_per_zone *mctz)
737 { 737 {
738 unsigned long flags; 738 unsigned long flags;
739 739
740 spin_lock_irqsave(&mctz->lock, flags); 740 spin_lock_irqsave(&mctz->lock, flags);
741 __mem_cgroup_remove_exceeded(mz, mctz); 741 __mem_cgroup_remove_exceeded(mz, mctz);
742 spin_unlock_irqrestore(&mctz->lock, flags); 742 spin_unlock_irqrestore(&mctz->lock, flags);
743 } 743 }
744 744
745 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 745 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
746 { 746 {
747 unsigned long nr_pages = page_counter_read(&memcg->memory); 747 unsigned long nr_pages = page_counter_read(&memcg->memory);
748 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 748 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
749 unsigned long excess = 0; 749 unsigned long excess = 0;
750 750
751 if (nr_pages > soft_limit) 751 if (nr_pages > soft_limit)
752 excess = nr_pages - soft_limit; 752 excess = nr_pages - soft_limit;
753 753
754 return excess; 754 return excess;
755 } 755 }
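A quick numeric illustration of soft_limit_excess(): if the memcg's page counter reads 1200 pages and memcg->soft_limit is 1000, the function returns 200; at or below the soft limit it returns 0, and __mem_cgroup_insert_exceeded() above then declines to put the group on the soft-limit tree.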
756 756
757 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 757 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
758 { 758 {
759 unsigned long excess; 759 unsigned long excess;
760 struct mem_cgroup_per_zone *mz; 760 struct mem_cgroup_per_zone *mz;
761 struct mem_cgroup_tree_per_zone *mctz; 761 struct mem_cgroup_tree_per_zone *mctz;
762 762
763 mctz = soft_limit_tree_from_page(page); 763 mctz = soft_limit_tree_from_page(page);
764 /* 764 /*
765 * Necessary to update all ancestors when hierarchy is used, 765 * Necessary to update all ancestors when hierarchy is used,
766 * because their event counter is not touched. 766 * because their event counter is not touched.
767 */ 767 */
768 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 768 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
769 mz = mem_cgroup_page_zoneinfo(memcg, page); 769 mz = mem_cgroup_page_zoneinfo(memcg, page);
770 excess = soft_limit_excess(memcg); 770 excess = soft_limit_excess(memcg);
771 /* 771 /*
772 * We have to update the tree if mz is on RB-tree or 772 * We have to update the tree if mz is on RB-tree or
773 * mem is over its softlimit. 773 * mem is over its softlimit.
774 */ 774 */
775 if (excess || mz->on_tree) { 775 if (excess || mz->on_tree) {
776 unsigned long flags; 776 unsigned long flags;
777 777
778 spin_lock_irqsave(&mctz->lock, flags); 778 spin_lock_irqsave(&mctz->lock, flags);
779 /* if on-tree, remove it */ 779 /* if on-tree, remove it */
780 if (mz->on_tree) 780 if (mz->on_tree)
781 __mem_cgroup_remove_exceeded(mz, mctz); 781 __mem_cgroup_remove_exceeded(mz, mctz);
782 /* 782 /*
783 * Insert again. mz->usage_in_excess will be updated. 783 * Insert again. mz->usage_in_excess will be updated.
784 * If excess is 0, no tree ops. 784 * If excess is 0, no tree ops.
785 */ 785 */
786 __mem_cgroup_insert_exceeded(mz, mctz, excess); 786 __mem_cgroup_insert_exceeded(mz, mctz, excess);
787 spin_unlock_irqrestore(&mctz->lock, flags); 787 spin_unlock_irqrestore(&mctz->lock, flags);
788 } 788 }
789 } 789 }
790 } 790 }
791 791
792 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 792 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
793 { 793 {
794 struct mem_cgroup_tree_per_zone *mctz; 794 struct mem_cgroup_tree_per_zone *mctz;
795 struct mem_cgroup_per_zone *mz; 795 struct mem_cgroup_per_zone *mz;
796 int nid, zid; 796 int nid, zid;
797 797
798 for_each_node(nid) { 798 for_each_node(nid) {
799 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 799 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
800 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 800 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
801 mctz = soft_limit_tree_node_zone(nid, zid); 801 mctz = soft_limit_tree_node_zone(nid, zid);
802 mem_cgroup_remove_exceeded(mz, mctz); 802 mem_cgroup_remove_exceeded(mz, mctz);
803 } 803 }
804 } 804 }
805 } 805 }
806 806
807 static struct mem_cgroup_per_zone * 807 static struct mem_cgroup_per_zone *
808 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 808 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
809 { 809 {
810 struct rb_node *rightmost = NULL; 810 struct rb_node *rightmost = NULL;
811 struct mem_cgroup_per_zone *mz; 811 struct mem_cgroup_per_zone *mz;
812 812
813 retry: 813 retry:
814 mz = NULL; 814 mz = NULL;
815 rightmost = rb_last(&mctz->rb_root); 815 rightmost = rb_last(&mctz->rb_root);
816 if (!rightmost) 816 if (!rightmost)
817 goto done; /* Nothing to reclaim from */ 817 goto done; /* Nothing to reclaim from */
818 818
819 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 819 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
820 /* 820 /*
821 * Remove the node now but someone else can add it back, 821 * Remove the node now but someone else can add it back,
822 * we will add it back at the end of reclaim to its correct 822 * we will add it back at the end of reclaim to its correct
823 * position in the tree. 823 * position in the tree.
824 */ 824 */
825 __mem_cgroup_remove_exceeded(mz, mctz); 825 __mem_cgroup_remove_exceeded(mz, mctz);
826 if (!soft_limit_excess(mz->memcg) || 826 if (!soft_limit_excess(mz->memcg) ||
827 !css_tryget_online(&mz->memcg->css)) 827 !css_tryget_online(&mz->memcg->css))
828 goto retry; 828 goto retry;
829 done: 829 done:
830 return mz; 830 return mz;
831 } 831 }
832 832
833 static struct mem_cgroup_per_zone * 833 static struct mem_cgroup_per_zone *
834 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 834 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
835 { 835 {
836 struct mem_cgroup_per_zone *mz; 836 struct mem_cgroup_per_zone *mz;
837 837
838 spin_lock_irq(&mctz->lock); 838 spin_lock_irq(&mctz->lock);
839 mz = __mem_cgroup_largest_soft_limit_node(mctz); 839 mz = __mem_cgroup_largest_soft_limit_node(mctz);
840 spin_unlock_irq(&mctz->lock); 840 spin_unlock_irq(&mctz->lock);
841 return mz; 841 return mz;
842 } 842 }
843 843
844 /* 844 /*
845 * Implementation Note: reading percpu statistics for memcg. 845 * Implementation Note: reading percpu statistics for memcg.
846 * 846 *
847 * Both vmstat[] and percpu_counter have thresholds and do periodic 847 * Both vmstat[] and percpu_counter have thresholds and do periodic
848 * synchronization to implement a "quick" read. There is a trade-off between 848 * synchronization to implement a "quick" read. There is a trade-off between
849 * reading cost and precision of the value, so we may eventually implement 849 * reading cost and precision of the value, so we may eventually implement
850 * periodic synchronization of memcg's own counters as well. 850 * periodic synchronization of memcg's own counters as well.
851 * 851 *
852 * But this _read() function is currently used for the user interface. Users 852 * But this _read() function is currently used for the user interface. Users
853 * account memory usage per memory cgroup and always require an exact value, 853 * account memory usage per memory cgroup and always require an exact value,
854 * because they are doing accounting. Even with a quick-and-fuzzy read we 854 * because they are doing accounting. Even with a quick-and-fuzzy read we
855 * would still have to visit all online cpus and sum them up, so for now no 855 * would still have to visit all online cpus and sum them up, so for now no
856 * extra synchronization is implemented (only the cpu-hotplug case is handled). 856 * extra synchronization is implemented (only the cpu-hotplug case is handled).
857 * 857 *
858 * If kernel-internal users appear that can tolerate an inexact value, and 858 * If kernel-internal users appear that can tolerate an inexact value, and
859 * reading all cpu values turns out to be a performance bottleneck in some 859 * reading all cpu values turns out to be a performance bottleneck in some
860 * common workload, thresholds and synchronization like vmstat[]'s should be 860 * common workload, thresholds and synchronization like vmstat[]'s should be
861 * implemented. 861 * implemented.
862 */ 862 */
863 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 863 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
864 enum mem_cgroup_stat_index idx) 864 enum mem_cgroup_stat_index idx)
865 { 865 {
866 long val = 0; 866 long val = 0;
867 int cpu; 867 int cpu;
868 868
869 get_online_cpus(); 869 get_online_cpus();
870 for_each_online_cpu(cpu) 870 for_each_online_cpu(cpu)
871 val += per_cpu(memcg->stat->count[idx], cpu); 871 val += per_cpu(memcg->stat->count[idx], cpu);
872 #ifdef CONFIG_HOTPLUG_CPU 872 #ifdef CONFIG_HOTPLUG_CPU
873 spin_lock(&memcg->pcp_counter_lock); 873 spin_lock(&memcg->pcp_counter_lock);
874 val += memcg->nocpu_base.count[idx]; 874 val += memcg->nocpu_base.count[idx];
875 spin_unlock(&memcg->pcp_counter_lock); 875 spin_unlock(&memcg->pcp_counter_lock);
876 #endif 876 #endif
877 put_online_cpus(); 877 put_online_cpus();
878 return val; 878 return val;
879 } 879 }
880 880
881 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 881 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
882 enum mem_cgroup_events_index idx) 882 enum mem_cgroup_events_index idx)
883 { 883 {
884 unsigned long val = 0; 884 unsigned long val = 0;
885 int cpu; 885 int cpu;
886 886
887 get_online_cpus(); 887 get_online_cpus();
888 for_each_online_cpu(cpu) 888 for_each_online_cpu(cpu)
889 val += per_cpu(memcg->stat->events[idx], cpu); 889 val += per_cpu(memcg->stat->events[idx], cpu);
890 #ifdef CONFIG_HOTPLUG_CPU 890 #ifdef CONFIG_HOTPLUG_CPU
891 spin_lock(&memcg->pcp_counter_lock); 891 spin_lock(&memcg->pcp_counter_lock);
892 val += memcg->nocpu_base.events[idx]; 892 val += memcg->nocpu_base.events[idx];
893 spin_unlock(&memcg->pcp_counter_lock); 893 spin_unlock(&memcg->pcp_counter_lock);
894 #endif 894 #endif
895 put_online_cpus(); 895 put_online_cpus();
896 return val; 896 return val;
897 } 897 }
898 898
899 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 899 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
900 struct page *page, 900 struct page *page,
901 int nr_pages) 901 int nr_pages)
902 { 902 {
903 /* 903 /*
904 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 904 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
905 * counted as CACHE even if it's on ANON LRU. 905 * counted as CACHE even if it's on ANON LRU.
906 */ 906 */
907 if (PageAnon(page)) 907 if (PageAnon(page))
908 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 908 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
909 nr_pages); 909 nr_pages);
910 else 910 else
911 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 911 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
912 nr_pages); 912 nr_pages);
913 913
914 if (PageTransHuge(page)) 914 if (PageTransHuge(page))
915 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 915 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
916 nr_pages); 916 nr_pages);
917 917
918 /* pagein of a big page is an event. So, ignore page size */ 918 /* pagein of a big page is an event. So, ignore page size */
919 if (nr_pages > 0) 919 if (nr_pages > 0)
920 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 920 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
921 else { 921 else {
922 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 922 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
923 nr_pages = -nr_pages; /* for event */ 923 nr_pages = -nr_pages; /* for event */
924 } 924 }
925 925
926 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 926 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
927 } 927 }
928 928
929 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 929 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
930 { 930 {
931 struct mem_cgroup_per_zone *mz; 931 struct mem_cgroup_per_zone *mz;
932 932
933 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 933 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
934 return mz->lru_size[lru]; 934 return mz->lru_size[lru];
935 } 935 }
936 936
937 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 937 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
938 int nid, 938 int nid,
939 unsigned int lru_mask) 939 unsigned int lru_mask)
940 { 940 {
941 unsigned long nr = 0; 941 unsigned long nr = 0;
942 int zid; 942 int zid;
943 943
944 VM_BUG_ON((unsigned)nid >= nr_node_ids); 944 VM_BUG_ON((unsigned)nid >= nr_node_ids);
945 945
946 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 946 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
947 struct mem_cgroup_per_zone *mz; 947 struct mem_cgroup_per_zone *mz;
948 enum lru_list lru; 948 enum lru_list lru;
949 949
950 for_each_lru(lru) { 950 for_each_lru(lru) {
951 if (!(BIT(lru) & lru_mask)) 951 if (!(BIT(lru) & lru_mask))
952 continue; 952 continue;
953 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 953 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
954 nr += mz->lru_size[lru]; 954 nr += mz->lru_size[lru];
955 } 955 }
956 } 956 }
957 return nr; 957 return nr;
958 } 958 }
959 959
960 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 960 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
961 unsigned int lru_mask) 961 unsigned int lru_mask)
962 { 962 {
963 unsigned long nr = 0; 963 unsigned long nr = 0;
964 int nid; 964 int nid;
965 965
966 for_each_node_state(nid, N_MEMORY) 966 for_each_node_state(nid, N_MEMORY)
967 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 967 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
968 return nr; 968 return nr;
969 } 969 }
970 970
971 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 971 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
972 enum mem_cgroup_events_target target) 972 enum mem_cgroup_events_target target)
973 { 973 {
974 unsigned long val, next; 974 unsigned long val, next;
975 975
976 val = __this_cpu_read(memcg->stat->nr_page_events); 976 val = __this_cpu_read(memcg->stat->nr_page_events);
977 next = __this_cpu_read(memcg->stat->targets[target]); 977 next = __this_cpu_read(memcg->stat->targets[target]);
978 /* from time_after() in jiffies.h */ 978 /* from time_after() in jiffies.h */
979 if ((long)next - (long)val < 0) { 979 if ((long)next - (long)val < 0) {
980 switch (target) { 980 switch (target) {
981 case MEM_CGROUP_TARGET_THRESH: 981 case MEM_CGROUP_TARGET_THRESH:
982 next = val + THRESHOLDS_EVENTS_TARGET; 982 next = val + THRESHOLDS_EVENTS_TARGET;
983 break; 983 break;
984 case MEM_CGROUP_TARGET_SOFTLIMIT: 984 case MEM_CGROUP_TARGET_SOFTLIMIT:
985 next = val + SOFTLIMIT_EVENTS_TARGET; 985 next = val + SOFTLIMIT_EVENTS_TARGET;
986 break; 986 break;
987 case MEM_CGROUP_TARGET_NUMAINFO: 987 case MEM_CGROUP_TARGET_NUMAINFO:
988 next = val + NUMAINFO_EVENTS_TARGET; 988 next = val + NUMAINFO_EVENTS_TARGET;
989 break; 989 break;
990 default: 990 default:
991 break; 991 break;
992 } 992 }
993 __this_cpu_write(memcg->stat->targets[target], next); 993 __this_cpu_write(memcg->stat->targets[target], next);
994 return true; 994 return true;
995 } 995 }
996 return false; 996 return false;
997 } 997 }
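The "(long)next - (long)val < 0" test in mem_cgroup_event_ratelimit() above borrows the time_after() idiom named in the comment: comparing through a signed difference keeps giving the right answer even after the unsigned event counter wraps around. A small standalone sketch of why the naive comparison breaks while the signed-difference test does not (the values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long val  = (unsigned long)-5;	/* event counter just below the wrap point */
	unsigned long next = val + 128;		/* target 128 events ahead; wraps to a small value */

	/* naive unsigned comparison wrongly claims the target was already reached */
	printf("val >= next : %d\n", val >= next);			/* prints 1 (wrong) */
	/* the signed difference, as used above, still says "not yet" */
	printf("signed test : %d\n", (long)next - (long)val < 0);	/* prints 0 (right) */

	val += 200;	/* the counter advances past the target, wrapping on the way */
	printf("signed test : %d\n", (long)next - (long)val < 0);	/* prints 1 (right) */
	return 0;
}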
998 998
999 /* 999 /*
1000 * Check events in order. 1000 * Check events in order.
1001 * 1001 *
1002 */ 1002 */
1003 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1003 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1004 { 1004 {
1005 /* threshold event is triggered in finer grain than soft limit */ 1005 /* threshold event is triggered in finer grain than soft limit */
1006 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1006 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1007 MEM_CGROUP_TARGET_THRESH))) { 1007 MEM_CGROUP_TARGET_THRESH))) {
1008 bool do_softlimit; 1008 bool do_softlimit;
1009 bool do_numainfo __maybe_unused; 1009 bool do_numainfo __maybe_unused;
1010 1010
1011 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1011 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1012 MEM_CGROUP_TARGET_SOFTLIMIT); 1012 MEM_CGROUP_TARGET_SOFTLIMIT);
1013 #if MAX_NUMNODES > 1 1013 #if MAX_NUMNODES > 1
1014 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1014 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1015 MEM_CGROUP_TARGET_NUMAINFO); 1015 MEM_CGROUP_TARGET_NUMAINFO);
1016 #endif 1016 #endif
1017 mem_cgroup_threshold(memcg); 1017 mem_cgroup_threshold(memcg);
1018 if (unlikely(do_softlimit)) 1018 if (unlikely(do_softlimit))
1019 mem_cgroup_update_tree(memcg, page); 1019 mem_cgroup_update_tree(memcg, page);
1020 #if MAX_NUMNODES > 1 1020 #if MAX_NUMNODES > 1
1021 if (unlikely(do_numainfo)) 1021 if (unlikely(do_numainfo))
1022 atomic_inc(&memcg->numainfo_events); 1022 atomic_inc(&memcg->numainfo_events);
1023 #endif 1023 #endif
1024 } 1024 }
1025 } 1025 }
1026 1026
1027 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1027 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1028 { 1028 {
1029 /* 1029 /*
1030 * mm_update_next_owner() may clear mm->owner to NULL 1030 * mm_update_next_owner() may clear mm->owner to NULL
1031 * if it races with swapoff, page migration, etc. 1031 * if it races with swapoff, page migration, etc.
1032 * So this can be called with p == NULL. 1032 * So this can be called with p == NULL.
1033 */ 1033 */
1034 if (unlikely(!p)) 1034 if (unlikely(!p))
1035 return NULL; 1035 return NULL;
1036 1036
1037 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1037 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1038 } 1038 }
1039 1039
1040 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1040 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1041 { 1041 {
1042 struct mem_cgroup *memcg = NULL; 1042 struct mem_cgroup *memcg = NULL;
1043 1043
1044 rcu_read_lock(); 1044 rcu_read_lock();
1045 do { 1045 do {
1046 /* 1046 /*
1047 * Page cache insertions can happen without an 1047 * Page cache insertions can happen without an
1048 * actual mm context, e.g. during disk probing 1048 * actual mm context, e.g. during disk probing
1049 * on boot, loopback IO, acct() writes etc. 1049 * on boot, loopback IO, acct() writes etc.
1050 */ 1050 */
1051 if (unlikely(!mm)) 1051 if (unlikely(!mm))
1052 memcg = root_mem_cgroup; 1052 memcg = root_mem_cgroup;
1053 else { 1053 else {
1054 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1054 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1055 if (unlikely(!memcg)) 1055 if (unlikely(!memcg))
1056 memcg = root_mem_cgroup; 1056 memcg = root_mem_cgroup;
1057 } 1057 }
1058 } while (!css_tryget_online(&memcg->css)); 1058 } while (!css_tryget_online(&memcg->css));
1059 rcu_read_unlock(); 1059 rcu_read_unlock();
1060 return memcg; 1060 return memcg;
1061 } 1061 }
1062 1062
1063 /** 1063 /**
1064 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1064 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1065 * @root: hierarchy root 1065 * @root: hierarchy root
1066 * @prev: previously returned memcg, NULL on first invocation 1066 * @prev: previously returned memcg, NULL on first invocation
1067 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1067 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1068 * 1068 *
1069 * Returns references to children of the hierarchy below @root, or 1069 * Returns references to children of the hierarchy below @root, or
1070 * @root itself, or %NULL after a full round-trip. 1070 * @root itself, or %NULL after a full round-trip.
1071 * 1071 *
1072 * Caller must pass the return value in @prev on subsequent 1072 * Caller must pass the return value in @prev on subsequent
1073 * invocations for reference counting, or use mem_cgroup_iter_break() 1073 * invocations for reference counting, or use mem_cgroup_iter_break()
1074 * to cancel a hierarchy walk before the round-trip is complete. 1074 * to cancel a hierarchy walk before the round-trip is complete.
1075 * 1075 *
1076 * Reclaimers can specify a zone and a priority level in @reclaim to 1076 * Reclaimers can specify a zone and a priority level in @reclaim to
1077 * divide up the memcgs in the hierarchy among all concurrent 1077 * divide up the memcgs in the hierarchy among all concurrent
1078 * reclaimers operating on the same zone and priority. 1078 * reclaimers operating on the same zone and priority.
1079 */ 1079 */
1080 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1080 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1081 struct mem_cgroup *prev, 1081 struct mem_cgroup *prev,
1082 struct mem_cgroup_reclaim_cookie *reclaim) 1082 struct mem_cgroup_reclaim_cookie *reclaim)
1083 { 1083 {
1084 struct reclaim_iter *uninitialized_var(iter); 1084 struct reclaim_iter *uninitialized_var(iter);
1085 struct cgroup_subsys_state *css = NULL; 1085 struct cgroup_subsys_state *css = NULL;
1086 struct mem_cgroup *memcg = NULL; 1086 struct mem_cgroup *memcg = NULL;
1087 struct mem_cgroup *pos = NULL; 1087 struct mem_cgroup *pos = NULL;
1088 1088
1089 if (mem_cgroup_disabled()) 1089 if (mem_cgroup_disabled())
1090 return NULL; 1090 return NULL;
1091 1091
1092 if (!root) 1092 if (!root)
1093 root = root_mem_cgroup; 1093 root = root_mem_cgroup;
1094 1094
1095 if (prev && !reclaim) 1095 if (prev && !reclaim)
1096 pos = prev; 1096 pos = prev;
1097 1097
1098 if (!root->use_hierarchy && root != root_mem_cgroup) { 1098 if (!root->use_hierarchy && root != root_mem_cgroup) {
1099 if (prev) 1099 if (prev)
1100 goto out; 1100 goto out;
1101 return root; 1101 return root;
1102 } 1102 }
1103 1103
1104 rcu_read_lock(); 1104 rcu_read_lock();
1105 1105
1106 if (reclaim) { 1106 if (reclaim) {
1107 struct mem_cgroup_per_zone *mz; 1107 struct mem_cgroup_per_zone *mz;
1108 1108
1109 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1109 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1110 iter = &mz->iter[reclaim->priority]; 1110 iter = &mz->iter[reclaim->priority];
1111 1111
1112 if (prev && reclaim->generation != iter->generation) 1112 if (prev && reclaim->generation != iter->generation)
1113 goto out_unlock; 1113 goto out_unlock;
1114 1114
1115 do { 1115 do {
1116 pos = ACCESS_ONCE(iter->position); 1116 pos = ACCESS_ONCE(iter->position);
1117 /* 1117 /*
1118 * A racing update may change the position and 1118 * A racing update may change the position and
1119 * put the last reference, hence css_tryget(), 1119 * put the last reference, hence css_tryget(),
1120 * or retry to see the updated position. 1120 * or retry to see the updated position.
1121 */ 1121 */
1122 } while (pos && !css_tryget(&pos->css)); 1122 } while (pos && !css_tryget(&pos->css));
1123 } 1123 }
1124 1124
1125 if (pos) 1125 if (pos)
1126 css = &pos->css; 1126 css = &pos->css;
1127 1127
1128 for (;;) { 1128 for (;;) {
1129 css = css_next_descendant_pre(css, &root->css); 1129 css = css_next_descendant_pre(css, &root->css);
1130 if (!css) { 1130 if (!css) {
1131 /* 1131 /*
1132 * Reclaimers share the hierarchy walk, and a 1132 * Reclaimers share the hierarchy walk, and a
1133 * new one might jump in right at the end of 1133 * new one might jump in right at the end of
1134 * the hierarchy - make sure they see at least 1134 * the hierarchy - make sure they see at least
1135 * one group and restart from the beginning. 1135 * one group and restart from the beginning.
1136 */ 1136 */
1137 if (!prev) 1137 if (!prev)
1138 continue; 1138 continue;
1139 break; 1139 break;
1140 } 1140 }
1141 1141
1142 /* 1142 /*
1143 * Verify the css and acquire a reference. The root 1143 * Verify the css and acquire a reference. The root
1144 * is provided by the caller, so we know it's alive 1144 * is provided by the caller, so we know it's alive
1145 * and kicking, and don't take an extra reference. 1145 * and kicking, and don't take an extra reference.
1146 */ 1146 */
1147 memcg = mem_cgroup_from_css(css); 1147 memcg = mem_cgroup_from_css(css);
1148 1148
1149 if (css == &root->css) 1149 if (css == &root->css)
1150 break; 1150 break;
1151 1151
1152 if (css_tryget_online(css)) { 1152 if (css_tryget_online(css)) {
1153 /* 1153 /*
1154 * Make sure the memcg is initialized: 1154 * Make sure the memcg is initialized:
1155 * mem_cgroup_css_online() orders the 1155 * mem_cgroup_css_online() orders the
1156 * initialization against setting the flag. 1156 * initialization against setting the flag.
1157 */ 1157 */
1158 if (smp_load_acquire(&memcg->initialized)) 1158 if (smp_load_acquire(&memcg->initialized))
1159 break; 1159 break;
1160 1160
1161 css_put(css); 1161 css_put(css);
1162 } 1162 }
1163 1163
1164 memcg = NULL; 1164 memcg = NULL;
1165 } 1165 }
1166 1166
1167 if (reclaim) { 1167 if (reclaim) {
1168 if (cmpxchg(&iter->position, pos, memcg) == pos) { 1168 if (cmpxchg(&iter->position, pos, memcg) == pos) {
1169 if (memcg) 1169 if (memcg)
1170 css_get(&memcg->css); 1170 css_get(&memcg->css);
1171 if (pos) 1171 if (pos)
1172 css_put(&pos->css); 1172 css_put(&pos->css);
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * pairs with css_tryget when dereferencing iter->position 1176 * pairs with css_tryget when dereferencing iter->position
1177 * above. 1177 * above.
1178 */ 1178 */
1179 if (pos) 1179 if (pos)
1180 css_put(&pos->css); 1180 css_put(&pos->css);
1181 1181
1182 if (!memcg) 1182 if (!memcg)
1183 iter->generation++; 1183 iter->generation++;
1184 else if (!prev) 1184 else if (!prev)
1185 reclaim->generation = iter->generation; 1185 reclaim->generation = iter->generation;
1186 } 1186 }
1187 1187
1188 out_unlock: 1188 out_unlock:
1189 rcu_read_unlock(); 1189 rcu_read_unlock();
1190 out: 1190 out:
1191 if (prev && prev != root) 1191 if (prev && prev != root)
1192 css_put(&prev->css); 1192 css_put(&prev->css);
1193 1193
1194 return memcg; 1194 return memcg;
1195 } 1195 }
1196 1196
1197 /** 1197 /**
1198 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1198 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1199 * @root: hierarchy root 1199 * @root: hierarchy root
1200 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1200 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1201 */ 1201 */
1202 void mem_cgroup_iter_break(struct mem_cgroup *root, 1202 void mem_cgroup_iter_break(struct mem_cgroup *root,
1203 struct mem_cgroup *prev) 1203 struct mem_cgroup *prev)
1204 { 1204 {
1205 if (!root) 1205 if (!root)
1206 root = root_mem_cgroup; 1206 root = root_mem_cgroup;
1207 if (prev && prev != root) 1207 if (prev && prev != root)
1208 css_put(&prev->css); 1208 css_put(&prev->css);
1209 } 1209 }
1210 1210
1211 /* 1211 /*
1212 * Iteration constructs for visiting all cgroups (under a tree). If 1212 * Iteration constructs for visiting all cgroups (under a tree). If
1213 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1213 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1214 * be used for reference counting. 1214 * be used for reference counting.
1215 */ 1215 */
1216 #define for_each_mem_cgroup_tree(iter, root) \ 1216 #define for_each_mem_cgroup_tree(iter, root) \
1217 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1217 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1218 iter != NULL; \ 1218 iter != NULL; \
1219 iter = mem_cgroup_iter(root, iter, NULL)) 1219 iter = mem_cgroup_iter(root, iter, NULL))
1220 1220
1221 #define for_each_mem_cgroup(iter) \ 1221 #define for_each_mem_cgroup(iter) \
1222 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1222 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1223 iter != NULL; \ 1223 iter != NULL; \
1224 iter = mem_cgroup_iter(NULL, iter, NULL)) 1224 iter = mem_cgroup_iter(NULL, iter, NULL))
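As the comment above says, leaving one of these loops before the walk completes must go through mem_cgroup_iter_break() so the reference on the last returned group is dropped. A minimal usage sketch, where the want_to_stop() predicate is purely illustrative and not a helper from this file:

static void walk_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (want_to_stop(iter)) {	/* illustrative predicate */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}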
1225 1225
1226 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1226 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1227 { 1227 {
1228 struct mem_cgroup *memcg; 1228 struct mem_cgroup *memcg;
1229 1229
1230 rcu_read_lock(); 1230 rcu_read_lock();
1231 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1231 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1232 if (unlikely(!memcg)) 1232 if (unlikely(!memcg))
1233 goto out; 1233 goto out;
1234 1234
1235 switch (idx) { 1235 switch (idx) {
1236 case PGFAULT: 1236 case PGFAULT:
1237 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1237 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1238 break; 1238 break;
1239 case PGMAJFAULT: 1239 case PGMAJFAULT:
1240 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1240 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1241 break; 1241 break;
1242 default: 1242 default:
1243 BUG(); 1243 BUG();
1244 } 1244 }
1245 out: 1245 out:
1246 rcu_read_unlock(); 1246 rcu_read_unlock();
1247 } 1247 }
1248 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1248 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1249 1249
1250 /** 1250 /**
1251 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1251 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1252 * @zone: zone of the wanted lruvec 1252 * @zone: zone of the wanted lruvec
1253 * @memcg: memcg of the wanted lruvec 1253 * @memcg: memcg of the wanted lruvec
1254 * 1254 *
1255 * Returns the lru list vector holding pages for the given @zone and 1255 * Returns the lru list vector holding pages for the given @zone and
1256 * @mem. This can be the global zone lruvec, if the memory controller 1256 * @mem. This can be the global zone lruvec, if the memory controller
1257 * is disabled. 1257 * is disabled.
1258 */ 1258 */
1259 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1259 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1260 struct mem_cgroup *memcg) 1260 struct mem_cgroup *memcg)
1261 { 1261 {
1262 struct mem_cgroup_per_zone *mz; 1262 struct mem_cgroup_per_zone *mz;
1263 struct lruvec *lruvec; 1263 struct lruvec *lruvec;
1264 1264
1265 if (mem_cgroup_disabled()) { 1265 if (mem_cgroup_disabled()) {
1266 lruvec = &zone->lruvec; 1266 lruvec = &zone->lruvec;
1267 goto out; 1267 goto out;
1268 } 1268 }
1269 1269
1270 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1270 mz = mem_cgroup_zone_zoneinfo(memcg, zone);
1271 lruvec = &mz->lruvec; 1271 lruvec = &mz->lruvec;
1272 out: 1272 out:
1273 /* 1273 /*
1274 * Since a node can be onlined after the mem_cgroup was created, 1274 * Since a node can be onlined after the mem_cgroup was created,
1275 * we have to be prepared to initialize lruvec->zone here; 1275 * we have to be prepared to initialize lruvec->zone here;
1276 * and if offlined then reonlined, we need to reinitialize it. 1276 * and if offlined then reonlined, we need to reinitialize it.
1277 */ 1277 */
1278 if (unlikely(lruvec->zone != zone)) 1278 if (unlikely(lruvec->zone != zone))
1279 lruvec->zone = zone; 1279 lruvec->zone = zone;
1280 return lruvec; 1280 return lruvec;
1281 } 1281 }
1282 1282
1283 /** 1283 /**
1284 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1284 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1285 * @page: the page 1285 * @page: the page
1286 * @zone: zone of the page 1286 * @zone: zone of the page
1287 */ 1287 */
1288 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1288 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1289 { 1289 {
1290 struct mem_cgroup_per_zone *mz; 1290 struct mem_cgroup_per_zone *mz;
1291 struct mem_cgroup *memcg; 1291 struct mem_cgroup *memcg;
1292 struct page_cgroup *pc; 1292 struct page_cgroup *pc;
1293 struct lruvec *lruvec; 1293 struct lruvec *lruvec;
1294 1294
1295 if (mem_cgroup_disabled()) { 1295 if (mem_cgroup_disabled()) {
1296 lruvec = &zone->lruvec; 1296 lruvec = &zone->lruvec;
1297 goto out; 1297 goto out;
1298 } 1298 }
1299 1299
1300 pc = lookup_page_cgroup(page); 1300 pc = lookup_page_cgroup(page);
1301 memcg = pc->mem_cgroup; 1301 memcg = pc->mem_cgroup;
1302 1302
1303 /* 1303 /*
1304 * Surreptitiously switch any uncharged offlist page to root: 1304 * Surreptitiously switch any uncharged offlist page to root:
1305 * an uncharged page off lru does nothing to secure 1305 * an uncharged page off lru does nothing to secure
1306 * its former mem_cgroup from sudden removal. 1306 * its former mem_cgroup from sudden removal.
1307 * 1307 *
1308 * Our caller holds lru_lock, and PageCgroupUsed is updated 1308 * Our caller holds lru_lock, and PageCgroupUsed is updated
1309 * under page_cgroup lock: between them, they make all uses 1309 * under page_cgroup lock: between them, they make all uses
1310 * of pc->mem_cgroup safe. 1310 * of pc->mem_cgroup safe.
1311 */ 1311 */
1312 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1312 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1313 pc->mem_cgroup = memcg = root_mem_cgroup; 1313 pc->mem_cgroup = memcg = root_mem_cgroup;
1314 1314
1315 mz = mem_cgroup_page_zoneinfo(memcg, page); 1315 mz = mem_cgroup_page_zoneinfo(memcg, page);
1316 lruvec = &mz->lruvec; 1316 lruvec = &mz->lruvec;
1317 out: 1317 out:
1318 /* 1318 /*
1319 * Since a node can be onlined after the mem_cgroup was created, 1319 * Since a node can be onlined after the mem_cgroup was created,
1320 * we have to be prepared to initialize lruvec->zone here; 1320 * we have to be prepared to initialize lruvec->zone here;
1321 * and if offlined then reonlined, we need to reinitialize it. 1321 * and if offlined then reonlined, we need to reinitialize it.
1322 */ 1322 */
1323 if (unlikely(lruvec->zone != zone)) 1323 if (unlikely(lruvec->zone != zone))
1324 lruvec->zone = zone; 1324 lruvec->zone = zone;
1325 return lruvec; 1325 return lruvec;
1326 } 1326 }
1327 1327
1328 /** 1328 /**
1329 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1329 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1330 * @lruvec: mem_cgroup per zone lru vector 1330 * @lruvec: mem_cgroup per zone lru vector
1331 * @lru: index of lru list the page is sitting on 1331 * @lru: index of lru list the page is sitting on
1332 * @nr_pages: positive when adding or negative when removing 1332 * @nr_pages: positive when adding or negative when removing
1333 * 1333 *
1334 * This function must be called when a page is added to or removed from an 1334 * This function must be called when a page is added to or removed from an
1335 * lru list. 1335 * lru list.
1336 */ 1336 */
1337 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1337 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1338 int nr_pages) 1338 int nr_pages)
1339 { 1339 {
1340 struct mem_cgroup_per_zone *mz; 1340 struct mem_cgroup_per_zone *mz;
1341 unsigned long *lru_size; 1341 unsigned long *lru_size;
1342 1342
1343 if (mem_cgroup_disabled()) 1343 if (mem_cgroup_disabled())
1344 return; 1344 return;
1345 1345
1346 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1346 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1347 lru_size = mz->lru_size + lru; 1347 lru_size = mz->lru_size + lru;
1348 *lru_size += nr_pages; 1348 *lru_size += nr_pages;
1349 VM_BUG_ON((long)(*lru_size) < 0); 1349 VM_BUG_ON((long)(*lru_size) < 0);
1350 } 1350 }
1351 1351
1352 /* 1352 /*
1353 * Checks whether the given memcg is the same as root_memcg or lies in 1353 * Checks whether the given memcg is the same as root_memcg or lies in
1354 * root_memcg's hierarchy subtree 1354 * root_memcg's hierarchy subtree
1355 */ 1355 */
1356 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1356 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1357 struct mem_cgroup *memcg) 1357 struct mem_cgroup *memcg)
1358 { 1358 {
1359 if (root_memcg == memcg) 1359 if (root_memcg == memcg)
1360 return true; 1360 return true;
1361 if (!root_memcg->use_hierarchy || !memcg) 1361 if (!root_memcg->use_hierarchy || !memcg)
1362 return false; 1362 return false;
1363 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1363 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1364 } 1364 }
1365 1365
1366 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1366 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1367 struct mem_cgroup *memcg) 1367 struct mem_cgroup *memcg)
1368 { 1368 {
1369 bool ret; 1369 bool ret;
1370 1370
1371 rcu_read_lock(); 1371 rcu_read_lock();
1372 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1372 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1373 rcu_read_unlock(); 1373 rcu_read_unlock();
1374 return ret; 1374 return ret;
1375 } 1375 }
1376 1376
1377 bool task_in_mem_cgroup(struct task_struct *task, 1377 bool task_in_mem_cgroup(struct task_struct *task,
1378 const struct mem_cgroup *memcg) 1378 const struct mem_cgroup *memcg)
1379 { 1379 {
1380 struct mem_cgroup *curr = NULL; 1380 struct mem_cgroup *curr = NULL;
1381 struct task_struct *p; 1381 struct task_struct *p;
1382 bool ret; 1382 bool ret;
1383 1383
1384 p = find_lock_task_mm(task); 1384 p = find_lock_task_mm(task);
1385 if (p) { 1385 if (p) {
1386 curr = get_mem_cgroup_from_mm(p->mm); 1386 curr = get_mem_cgroup_from_mm(p->mm);
1387 task_unlock(p); 1387 task_unlock(p);
1388 } else { 1388 } else {
1389 /* 1389 /*
1390 * All threads may have already detached their mm's, but the oom 1390 * All threads may have already detached their mm's, but the oom
1391 * killer still needs to detect if they have already been oom 1391 * killer still needs to detect if they have already been oom
1392 * killed to prevent needlessly killing additional tasks. 1392 * killed to prevent needlessly killing additional tasks.
1393 */ 1393 */
1394 rcu_read_lock(); 1394 rcu_read_lock();
1395 curr = mem_cgroup_from_task(task); 1395 curr = mem_cgroup_from_task(task);
1396 if (curr) 1396 if (curr)
1397 css_get(&curr->css); 1397 css_get(&curr->css);
1398 rcu_read_unlock(); 1398 rcu_read_unlock();
1399 } 1399 }
1400 /* 1400 /*
1401 * We should check use_hierarchy of "memcg", not "curr", because checking 1401 * We should check use_hierarchy of "memcg", not "curr", because checking
1402 * use_hierarchy of "curr" here would make this function return true if 1402 * use_hierarchy of "curr" here would make this function return true if
1403 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the 1403 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1404 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). 1404 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1405 */ 1405 */
1406 ret = mem_cgroup_same_or_subtree(memcg, curr); 1406 ret = mem_cgroup_same_or_subtree(memcg, curr);
1407 css_put(&curr->css); 1407 css_put(&curr->css);
1408 return ret; 1408 return ret;
1409 } 1409 }
1410 1410
1411 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1411 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1412 { 1412 {
1413 unsigned long inactive_ratio; 1413 unsigned long inactive_ratio;
1414 unsigned long inactive; 1414 unsigned long inactive;
1415 unsigned long active; 1415 unsigned long active;
1416 unsigned long gb; 1416 unsigned long gb;
1417 1417
1418 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1418 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1419 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1419 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1420 1420
1421 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1421 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1422 if (gb) 1422 if (gb)
1423 inactive_ratio = int_sqrt(10 * gb); 1423 inactive_ratio = int_sqrt(10 * gb);
1424 else 1424 else
1425 inactive_ratio = 1; 1425 inactive_ratio = 1;
1426 1426
1427 return inactive * inactive_ratio < active; 1427 return inactive * inactive_ratio < active;
1428 } 1428 }
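To make the ratio above concrete (assuming 4 KiB pages): with roughly 10 GB of anonymous memory on this lruvec, gb is 10 and inactive_ratio becomes int_sqrt(100) = 10, so the inactive list is only reported as low when it is smaller than a tenth of the active list; below 1 GB, gb is 0, the ratio falls back to 1, and the two list sizes are simply compared directly.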
1429 1429
1430 #define mem_cgroup_from_counter(counter, member) \ 1430 #define mem_cgroup_from_counter(counter, member) \
1431 container_of(counter, struct mem_cgroup, member) 1431 container_of(counter, struct mem_cgroup, member)
1432 1432
1433 /** 1433 /**
1434 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1434 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1435 * @memcg: the memory cgroup 1435 * @memcg: the memory cgroup
1436 * 1436 *
1437 * Returns the maximum amount of memory @mem can be charged with, in 1437 * Returns the maximum amount of memory @mem can be charged with, in
1438 * pages. 1438 * pages.
1439 */ 1439 */
1440 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1440 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1441 { 1441 {
1442 unsigned long margin = 0; 1442 unsigned long margin = 0;
1443 unsigned long count; 1443 unsigned long count;
1444 unsigned long limit; 1444 unsigned long limit;
1445 1445
1446 count = page_counter_read(&memcg->memory); 1446 count = page_counter_read(&memcg->memory);
1447 limit = ACCESS_ONCE(memcg->memory.limit); 1447 limit = ACCESS_ONCE(memcg->memory.limit);
1448 if (count < limit) 1448 if (count < limit)
1449 margin = limit - count; 1449 margin = limit - count;
1450 1450
1451 if (do_swap_account) { 1451 if (do_swap_account) {
1452 count = page_counter_read(&memcg->memsw); 1452 count = page_counter_read(&memcg->memsw);
1453 limit = ACCESS_ONCE(memcg->memsw.limit); 1453 limit = ACCESS_ONCE(memcg->memsw.limit);
1454 if (count <= limit) 1454 if (count <= limit)
1455 margin = min(margin, limit - count); 1455 margin = min(margin, limit - count);
1456 } 1456 }
1457 1457
1458 return margin; 1458 return margin;
1459 } 1459 }
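
Since the margin is simply the smaller of the two counters' headroom, a quick userspace model shows why the memsw counter can clamp the result (hypothetical counter values, in pages; this is not the kernel's page_counter API):

#include <stdio.h>

/*
 * Userspace model of mem_cgroup_margin(): the chargeable headroom is the
 * smaller of the memory and memory+swap headrooms. The counter values
 * below are hypothetical, in pages.
 */
struct counter {
	unsigned long count;
	unsigned long limit;
};

static unsigned long headroom(const struct counter *c)
{
	return c->count < c->limit ? c->limit - c->count : 0;
}

int main(void)
{
	struct counter memory = { .count = 24000, .limit = 25600 };
	struct counter memsw = { .count = 30000, .limit = 30720 };
	unsigned long margin = headroom(&memory);
	unsigned long memsw_margin = headroom(&memsw);

	if (memsw_margin < margin)	/* swap accounting enabled: clamp */
		margin = memsw_margin;
	printf("margin = %lu pages\n", margin);	/* 720, not 1600 */
	return 0;
}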
1460 1460
1461 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1461 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1462 { 1462 {
1463 /* root ? */ 1463 /* root ? */
1464 if (mem_cgroup_disabled() || !memcg->css.parent) 1464 if (mem_cgroup_disabled() || !memcg->css.parent)
1465 return vm_swappiness; 1465 return vm_swappiness;
1466 1466
1467 return memcg->swappiness; 1467 return memcg->swappiness;
1468 } 1468 }
1469 1469
1470 /* 1470 /*
1471 * memcg->moving_account is used for checking possibility that some thread is 1471 * memcg->moving_account is used for checking possibility that some thread is
1472 * calling move_account(). When a thread on CPU-A starts moving pages under 1472 * calling move_account(). When a thread on CPU-A starts moving pages under
1473 * a memcg, other threads should check memcg->moving_account under 1473 * a memcg, other threads should check memcg->moving_account under
1474 * rcu_read_lock(), like this: 1474 * rcu_read_lock(), like this:
1475 * 1475 *
1476 * CPU-A CPU-B 1476 * CPU-A CPU-B
1477 * rcu_read_lock() 1477 * rcu_read_lock()
1478 * memcg->moving_account+1 if (memcg->moving_account) 1478 * memcg->moving_account+1 if (memcg->moving_account)
1479 * take heavy locks. 1479 * take heavy locks.
1480 * synchronize_rcu() update something. 1480 * synchronize_rcu() update something.
1481 * rcu_read_unlock() 1481 * rcu_read_unlock()
1482 * start move here. 1482 * start move here.
1483 */ 1483 */
1484 1484
1485 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1485 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1486 { 1486 {
1487 atomic_inc(&memcg->moving_account); 1487 atomic_inc(&memcg->moving_account);
1488 synchronize_rcu(); 1488 synchronize_rcu();
1489 } 1489 }
1490 1490
1491 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1491 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1492 { 1492 {
1493 /* 1493 /*
1494 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1494 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1495 * We check NULL in callee rather than caller. 1495 * We check NULL in callee rather than caller.
1496 */ 1496 */
1497 if (memcg) 1497 if (memcg)
1498 atomic_dec(&memcg->moving_account); 1498 atomic_dec(&memcg->moving_account);
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * A routine for checking whether "mem" is under move_account() or not. 1502 * A routine for checking whether "mem" is under move_account() or not.
1503 * 1503 *
1504 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of 1504 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
1505 * the moving cgroups. This is for waiting at high memory pressure 1505 * the moving cgroups. This is for waiting at high memory pressure
1506 * caused by "move". 1506 * caused by "move".
1507 */ 1507 */
1508 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1508 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1509 { 1509 {
1510 struct mem_cgroup *from; 1510 struct mem_cgroup *from;
1511 struct mem_cgroup *to; 1511 struct mem_cgroup *to;
1512 bool ret = false; 1512 bool ret = false;
1513 /* 1513 /*
1514 * Unlike task_move routines, we access mc.to, mc.from not under 1514 * Unlike task_move routines, we access mc.to, mc.from not under
1515 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1515 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1516 */ 1516 */
1517 spin_lock(&mc.lock); 1517 spin_lock(&mc.lock);
1518 from = mc.from; 1518 from = mc.from;
1519 to = mc.to; 1519 to = mc.to;
1520 if (!from) 1520 if (!from)
1521 goto unlock; 1521 goto unlock;
1522 1522
1523 ret = mem_cgroup_same_or_subtree(memcg, from) 1523 ret = mem_cgroup_same_or_subtree(memcg, from)
1524 || mem_cgroup_same_or_subtree(memcg, to); 1524 || mem_cgroup_same_or_subtree(memcg, to);
1525 unlock: 1525 unlock:
1526 spin_unlock(&mc.lock); 1526 spin_unlock(&mc.lock);
1527 return ret; 1527 return ret;
1528 } 1528 }
1529 1529
1530 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1530 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1531 { 1531 {
1532 if (mc.moving_task && current != mc.moving_task) { 1532 if (mc.moving_task && current != mc.moving_task) {
1533 if (mem_cgroup_under_move(memcg)) { 1533 if (mem_cgroup_under_move(memcg)) {
1534 DEFINE_WAIT(wait); 1534 DEFINE_WAIT(wait);
1535 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1535 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1536 /* moving charge context might have finished. */ 1536 /* moving charge context might have finished. */
1537 if (mc.moving_task) 1537 if (mc.moving_task)
1538 schedule(); 1538 schedule();
1539 finish_wait(&mc.waitq, &wait); 1539 finish_wait(&mc.waitq, &wait);
1540 return true; 1540 return true;
1541 } 1541 }
1542 } 1542 }
1543 return false; 1543 return false;
1544 } 1544 }
1545 1545
1546 /* 1546 /*
1547 * Take this lock when 1547 * Take this lock when
1548 * - some code tries to modify a page's memcg while it's USED. 1548 * - some code tries to modify a page's memcg while it's USED.
1549 * - some code tries to modify page state accounting in a memcg. 1549 * - some code tries to modify page state accounting in a memcg.
1550 */ 1550 */
1551 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1551 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1552 unsigned long *flags) 1552 unsigned long *flags)
1553 { 1553 {
1554 spin_lock_irqsave(&memcg->move_lock, *flags); 1554 spin_lock_irqsave(&memcg->move_lock, *flags);
1555 } 1555 }
1556 1556
1557 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1557 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1558 unsigned long *flags) 1558 unsigned long *flags)
1559 { 1559 {
1560 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1560 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1561 } 1561 }
1562 1562
1563 #define K(x) ((x) << (PAGE_SHIFT-10)) 1563 #define K(x) ((x) << (PAGE_SHIFT-10))
1564 /** 1564 /**
1565 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1565 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1566 * @memcg: The memory cgroup that went over limit 1566 * @memcg: The memory cgroup that went over limit
1567 * @p: Task that is going to be killed 1567 * @p: Task that is going to be killed
1568 * 1568 *
1569 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1569 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1570 * enabled 1570 * enabled
1571 */ 1571 */
1572 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1572 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1573 { 1573 {
1574 /* oom_info_lock ensures that parallel ooms do not interleave */ 1574 /* oom_info_lock ensures that parallel ooms do not interleave */
1575 static DEFINE_MUTEX(oom_info_lock); 1575 static DEFINE_MUTEX(oom_info_lock);
1576 struct mem_cgroup *iter; 1576 struct mem_cgroup *iter;
1577 unsigned int i; 1577 unsigned int i;
1578 1578
1579 if (!p) 1579 if (!p)
1580 return; 1580 return;
1581 1581
1582 mutex_lock(&oom_info_lock); 1582 mutex_lock(&oom_info_lock);
1583 rcu_read_lock(); 1583 rcu_read_lock();
1584 1584
1585 pr_info("Task in "); 1585 pr_info("Task in ");
1586 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1586 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1587 pr_info(" killed as a result of limit of "); 1587 pr_info(" killed as a result of limit of ");
1588 pr_cont_cgroup_path(memcg->css.cgroup); 1588 pr_cont_cgroup_path(memcg->css.cgroup);
1589 pr_info("\n"); 1589 pr_info("\n");
1590 1590
1591 rcu_read_unlock(); 1591 rcu_read_unlock();
1592 1592
1593 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1593 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1594 K((u64)page_counter_read(&memcg->memory)), 1594 K((u64)page_counter_read(&memcg->memory)),
1595 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1595 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1596 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1596 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1597 K((u64)page_counter_read(&memcg->memsw)), 1597 K((u64)page_counter_read(&memcg->memsw)),
1598 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1598 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1599 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1599 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1600 K((u64)page_counter_read(&memcg->kmem)), 1600 K((u64)page_counter_read(&memcg->kmem)),
1601 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1601 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1602 1602
1603 for_each_mem_cgroup_tree(iter, memcg) { 1603 for_each_mem_cgroup_tree(iter, memcg) {
1604 pr_info("Memory cgroup stats for "); 1604 pr_info("Memory cgroup stats for ");
1605 pr_cont_cgroup_path(iter->css.cgroup); 1605 pr_cont_cgroup_path(iter->css.cgroup);
1606 pr_cont(":"); 1606 pr_cont(":");
1607 1607
1608 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1608 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1609 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1609 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1610 continue; 1610 continue;
1611 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1611 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1612 K(mem_cgroup_read_stat(iter, i))); 1612 K(mem_cgroup_read_stat(iter, i)));
1613 } 1613 }
1614 1614
1615 for (i = 0; i < NR_LRU_LISTS; i++) 1615 for (i = 0; i < NR_LRU_LISTS; i++)
1616 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1616 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1617 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1617 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1618 1618
1619 pr_cont("\n"); 1619 pr_cont("\n");
1620 } 1620 }
1621 mutex_unlock(&oom_info_lock); 1621 mutex_unlock(&oom_info_lock);
1622 } 1622 }
1623 1623
1624 /* 1624 /*
1625 * This function returns the number of memcgs under the hierarchy tree. Returns 1625 * This function returns the number of memcgs under the hierarchy tree. Returns
1626 * 1 (self count) if there are no children. 1626 * 1 (self count) if there are no children.
1627 */ 1627 */
1628 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1628 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1629 { 1629 {
1630 int num = 0; 1630 int num = 0;
1631 struct mem_cgroup *iter; 1631 struct mem_cgroup *iter;
1632 1632
1633 for_each_mem_cgroup_tree(iter, memcg) 1633 for_each_mem_cgroup_tree(iter, memcg)
1634 num++; 1634 num++;
1635 return num; 1635 return num;
1636 } 1636 }
1637 1637
1638 /* 1638 /*
1639 * Return the memory (and swap, if configured) limit for a memcg. 1639 * Return the memory (and swap, if configured) limit for a memcg.
1640 */ 1640 */
1641 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1641 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1642 { 1642 {
1643 unsigned long limit; 1643 unsigned long limit;
1644 1644
1645 limit = memcg->memory.limit; 1645 limit = memcg->memory.limit;
1646 if (mem_cgroup_swappiness(memcg)) { 1646 if (mem_cgroup_swappiness(memcg)) {
1647 unsigned long memsw_limit; 1647 unsigned long memsw_limit;
1648 1648
1649 memsw_limit = memcg->memsw.limit; 1649 memsw_limit = memcg->memsw.limit;
1650 limit = min(limit + total_swap_pages, memsw_limit); 1650 limit = min(limit + total_swap_pages, memsw_limit);
1651 } 1651 }
1652 return limit; 1652 return limit;
1653 } 1653 }
1654 1654
1655 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1655 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1656 int order) 1656 int order)
1657 { 1657 {
1658 struct mem_cgroup *iter; 1658 struct mem_cgroup *iter;
1659 unsigned long chosen_points = 0; 1659 unsigned long chosen_points = 0;
1660 unsigned long totalpages; 1660 unsigned long totalpages;
1661 unsigned int points = 0; 1661 unsigned int points = 0;
1662 struct task_struct *chosen = NULL; 1662 struct task_struct *chosen = NULL;
1663 1663
1664 /* 1664 /*
1665 * If current has a pending SIGKILL or is exiting, then automatically 1665 * If current has a pending SIGKILL or is exiting, then automatically
1666 * select it. The goal is to allow it to allocate so that it may 1666 * select it. The goal is to allow it to allocate so that it may
1667 * quickly exit and free its memory. 1667 * quickly exit and free its memory.
1668 */ 1668 */
1669 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1669 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1670 set_thread_flag(TIF_MEMDIE); 1670 set_thread_flag(TIF_MEMDIE);
1671 return; 1671 return;
1672 } 1672 }
1673 1673
1674 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1674 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1675 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1675 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1676 for_each_mem_cgroup_tree(iter, memcg) { 1676 for_each_mem_cgroup_tree(iter, memcg) {
1677 struct css_task_iter it; 1677 struct css_task_iter it;
1678 struct task_struct *task; 1678 struct task_struct *task;
1679 1679
1680 css_task_iter_start(&iter->css, &it); 1680 css_task_iter_start(&iter->css, &it);
1681 while ((task = css_task_iter_next(&it))) { 1681 while ((task = css_task_iter_next(&it))) {
1682 switch (oom_scan_process_thread(task, totalpages, NULL, 1682 switch (oom_scan_process_thread(task, totalpages, NULL,
1683 false)) { 1683 false)) {
1684 case OOM_SCAN_SELECT: 1684 case OOM_SCAN_SELECT:
1685 if (chosen) 1685 if (chosen)
1686 put_task_struct(chosen); 1686 put_task_struct(chosen);
1687 chosen = task; 1687 chosen = task;
1688 chosen_points = ULONG_MAX; 1688 chosen_points = ULONG_MAX;
1689 get_task_struct(chosen); 1689 get_task_struct(chosen);
1690 /* fall through */ 1690 /* fall through */
1691 case OOM_SCAN_CONTINUE: 1691 case OOM_SCAN_CONTINUE:
1692 continue; 1692 continue;
1693 case OOM_SCAN_ABORT: 1693 case OOM_SCAN_ABORT:
1694 css_task_iter_end(&it); 1694 css_task_iter_end(&it);
1695 mem_cgroup_iter_break(memcg, iter); 1695 mem_cgroup_iter_break(memcg, iter);
1696 if (chosen) 1696 if (chosen)
1697 put_task_struct(chosen); 1697 put_task_struct(chosen);
1698 return; 1698 return;
1699 case OOM_SCAN_OK: 1699 case OOM_SCAN_OK:
1700 break; 1700 break;
1701 }; 1701 };
1702 points = oom_badness(task, memcg, NULL, totalpages); 1702 points = oom_badness(task, memcg, NULL, totalpages);
1703 if (!points || points < chosen_points) 1703 if (!points || points < chosen_points)
1704 continue; 1704 continue;
1705 /* Prefer thread group leaders for display purposes */ 1705 /* Prefer thread group leaders for display purposes */
1706 if (points == chosen_points && 1706 if (points == chosen_points &&
1707 thread_group_leader(chosen)) 1707 thread_group_leader(chosen))
1708 continue; 1708 continue;
1709 1709
1710 if (chosen) 1710 if (chosen)
1711 put_task_struct(chosen); 1711 put_task_struct(chosen);
1712 chosen = task; 1712 chosen = task;
1713 chosen_points = points; 1713 chosen_points = points;
1714 get_task_struct(chosen); 1714 get_task_struct(chosen);
1715 } 1715 }
1716 css_task_iter_end(&it); 1716 css_task_iter_end(&it);
1717 } 1717 }
1718 1718
1719 if (!chosen) 1719 if (!chosen)
1720 return; 1720 return;
1721 points = chosen_points * 1000 / totalpages; 1721 points = chosen_points * 1000 / totalpages;
1722 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1722 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1723 NULL, "Memory cgroup out of memory"); 1723 NULL, "Memory cgroup out of memory");
1724 } 1724 }
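
The task scan above boils down to keeping the highest oom_badness() score seen so far, skipping zero or lower scores, and on an exact tie keeping the task that was chosen earlier. A userspace model with made-up scores (it leaves out the OOM_SCAN_* special cases, refcounting, and the thread-group-leader tie-break detail):

#include <stdio.h>

/*
 * Userspace model of the victim scan above: keep the task with the
 * highest badness score, skip zero or lower scores, and on an exact tie
 * keep the task chosen earlier. The scores are made up.
 */
int main(void)
{
	unsigned int points[] = { 120, 0, 480, 480, 33 };
	unsigned long chosen_points = 0;
	int chosen = -1;

	for (int i = 0; i < 5; i++) {
		if (!points[i] || points[i] < chosen_points)
			continue;
		if (points[i] == chosen_points && chosen >= 0)
			continue;	/* tie: keep the current choice */
		chosen = i;
		chosen_points = points[i];
	}
	printf("chosen task %d with %lu points\n", chosen, chosen_points);
	return 0;
}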
1725 1725
1726 /** 1726 /**
1727 * test_mem_cgroup_node_reclaimable 1727 * test_mem_cgroup_node_reclaimable
1728 * @memcg: the target memcg 1728 * @memcg: the target memcg
1729 * @nid: the node ID to be checked. 1729 * @nid: the node ID to be checked.
1730 * @noswap : specify true here if the user wants file only information. 1730 * @noswap : specify true here if the user wants file only information.
1731 * 1731 *
1732 * This function returns whether the specified memcg contains any 1732 * This function returns whether the specified memcg contains any
1733 * reclaimable pages on a node. Returns true if there are any reclaimable 1733 * reclaimable pages on a node. Returns true if there are any reclaimable
1734 * pages in the node. 1734 * pages in the node.
1735 */ 1735 */
1736 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1736 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1737 int nid, bool noswap) 1737 int nid, bool noswap)
1738 { 1738 {
1739 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1739 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1740 return true; 1740 return true;
1741 if (noswap || !total_swap_pages) 1741 if (noswap || !total_swap_pages)
1742 return false; 1742 return false;
1743 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1743 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1744 return true; 1744 return true;
1745 return false; 1745 return false;
1746 1746
1747 } 1747 }
1748 #if MAX_NUMNODES > 1 1748 #if MAX_NUMNODES > 1
1749 1749
1750 /* 1750 /*
1751 * Always updating the nodemask is not very good - even if we have an empty 1751 * Always updating the nodemask is not very good - even if we have an empty
1752 * list or the wrong list here, we can start from some node and traverse all 1752 * list or the wrong list here, we can start from some node and traverse all
1753 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1753 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1754 * 1754 *
1755 */ 1755 */
1756 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1756 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1757 { 1757 {
1758 int nid; 1758 int nid;
1759 /* 1759 /*
1760 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1760 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1761 * pagein/pageout changes since the last update. 1761 * pagein/pageout changes since the last update.
1762 */ 1762 */
1763 if (!atomic_read(&memcg->numainfo_events)) 1763 if (!atomic_read(&memcg->numainfo_events))
1764 return; 1764 return;
1765 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1765 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1766 return; 1766 return;
1767 1767
1768 /* make a nodemask where this memcg uses memory from */ 1768 /* make a nodemask where this memcg uses memory from */
1769 memcg->scan_nodes = node_states[N_MEMORY]; 1769 memcg->scan_nodes = node_states[N_MEMORY];
1770 1770
1771 for_each_node_mask(nid, node_states[N_MEMORY]) { 1771 for_each_node_mask(nid, node_states[N_MEMORY]) {
1772 1772
1773 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1773 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1774 node_clear(nid, memcg->scan_nodes); 1774 node_clear(nid, memcg->scan_nodes);
1775 } 1775 }
1776 1776
1777 atomic_set(&memcg->numainfo_events, 0); 1777 atomic_set(&memcg->numainfo_events, 0);
1778 atomic_set(&memcg->numainfo_updating, 0); 1778 atomic_set(&memcg->numainfo_updating, 0);
1779 } 1779 }
1780 1780
1781 /* 1781 /*
1782 * Select a node to start reclaim from. Because what we need is just 1782 * Select a node to start reclaim from. Because what we need is just
1783 * reducing the usage counter, starting from anywhere is OK. Considering 1783 * reducing the usage counter, starting from anywhere is OK. Considering
1784 * memory reclaim from the current node, there are pros and cons. 1784 * memory reclaim from the current node, there are pros and cons.
1785 * 1785 *
1786 * Freeing memory from current node means freeing memory from a node which 1786 * Freeing memory from current node means freeing memory from a node which
1787 * we'll use or we've used. So, it may make the LRU bad. And if several threads 1787 * we'll use or we've used. So, it may make the LRU bad. And if several threads
1788 * hit their limits, they will contend on a node. But freeing from a remote 1788 * hit their limits, they will contend on a node. But freeing from a remote
1789 * node means more costs for memory reclaim because of memory latency. 1789 * node means more costs for memory reclaim because of memory latency.
1790 * 1790 *
1791 * Now, we use round-robin. Better algorithm is welcomed. 1791 * Now, we use round-robin. Better algorithm is welcomed.
1792 */ 1792 */
1793 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1793 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1794 { 1794 {
1795 int node; 1795 int node;
1796 1796
1797 mem_cgroup_may_update_nodemask(memcg); 1797 mem_cgroup_may_update_nodemask(memcg);
1798 node = memcg->last_scanned_node; 1798 node = memcg->last_scanned_node;
1799 1799
1800 node = next_node(node, memcg->scan_nodes); 1800 node = next_node(node, memcg->scan_nodes);
1801 if (node == MAX_NUMNODES) 1801 if (node == MAX_NUMNODES)
1802 node = first_node(memcg->scan_nodes); 1802 node = first_node(memcg->scan_nodes);
1803 /* 1803 /*
1804 * We call this when we hit limit, not when pages are added to LRU. 1804 * We call this when we hit limit, not when pages are added to LRU.
1805 * No LRU may hold pages because all pages are UNEVICTABLE or 1805 * No LRU may hold pages because all pages are UNEVICTABLE or
1806 * memcg is too small and all pages are not on LRU. In that case, 1806 * memcg is too small and all pages are not on LRU. In that case,
1807 * we use the current node. 1807 * we use the current node.
1808 */ 1808 */
1809 if (unlikely(node == MAX_NUMNODES)) 1809 if (unlikely(node == MAX_NUMNODES))
1810 node = numa_node_id(); 1810 node = numa_node_id();
1811 1811
1812 memcg->last_scanned_node = node; 1812 memcg->last_scanned_node = node;
1813 return node; 1813 return node;
1814 } 1814 }
1815 1815
1816 /* 1816 /*
1817 * Check all nodes whether they contain reclaimable pages or not. 1817 * Check all nodes whether they contain reclaimable pages or not.
1818 * For quick scan, we make use of scan_nodes. This will allow us to skip 1818 * For quick scan, we make use of scan_nodes. This will allow us to skip
1819 * unused nodes. But scan_nodes is lazily updated and may not contain 1819 * unused nodes. But scan_nodes is lazily updated and may not contain
1820 * enough new information. We need to do double check. 1820 * enough new information. We need to do double check.
1821 */ 1821 */
1822 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1822 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1823 { 1823 {
1824 int nid; 1824 int nid;
1825 1825
1826 /* 1826 /*
1827 * quick check...making use of scan_node. 1827 * quick check...making use of scan_node.
1828 * We can skip unused nodes. 1828 * We can skip unused nodes.
1829 */ 1829 */
1830 if (!nodes_empty(memcg->scan_nodes)) { 1830 if (!nodes_empty(memcg->scan_nodes)) {
1831 for (nid = first_node(memcg->scan_nodes); 1831 for (nid = first_node(memcg->scan_nodes);
1832 nid < MAX_NUMNODES; 1832 nid < MAX_NUMNODES;
1833 nid = next_node(nid, memcg->scan_nodes)) { 1833 nid = next_node(nid, memcg->scan_nodes)) {
1834 1834
1835 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1835 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1836 return true; 1836 return true;
1837 } 1837 }
1838 } 1838 }
1839 /* 1839 /*
1840 * Check rest of nodes. 1840 * Check rest of nodes.
1841 */ 1841 */
1842 for_each_node_state(nid, N_MEMORY) { 1842 for_each_node_state(nid, N_MEMORY) {
1843 if (node_isset(nid, memcg->scan_nodes)) 1843 if (node_isset(nid, memcg->scan_nodes))
1844 continue; 1844 continue;
1845 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1845 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1846 return true; 1846 return true;
1847 } 1847 }
1848 return false; 1848 return false;
1849 } 1849 }
1850 1850
1851 #else 1851 #else
1852 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1852 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1853 { 1853 {
1854 return 0; 1854 return 0;
1855 } 1855 }
1856 1856
1857 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1857 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1858 { 1858 {
1859 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1859 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1860 } 1860 }
1861 #endif 1861 #endif
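
The victim-node choice in mem_cgroup_select_victim_node() is a plain round-robin walk over the scan_nodes mask with a fallback to the local node. A self-contained model behaves the same way (an 8-node toy mask; the helpers only mimic the kernel's nodemask API):

#include <stdio.h>

/*
 * Userspace model of the round-robin victim-node choice above: take the
 * next set bit in a small scan_nodes mask after the last node used,
 * wrapping around, and fall back to the local node when the mask is
 * empty.
 */
#define MAX_NODES 8

static int next_set(int prev, unsigned int mask)
{
	for (int nid = prev + 1; nid < MAX_NODES; nid++)
		if (mask & (1u << nid))
			return nid;
	return MAX_NODES;	/* no more set bits */
}

static int select_victim_node(int *last, unsigned int scan_nodes, int this_node)
{
	int node = next_set(*last, scan_nodes);

	if (node == MAX_NODES)
		node = next_set(-1, scan_nodes);	/* wrap to the first node */
	if (node == MAX_NODES)
		node = this_node;	/* empty mask: reclaim where we run */
	*last = node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = 0x05;	/* nodes 0 and 2 hold reclaimable pages */
	int last = -1;

	for (int i = 0; i < 4; i++)
		printf("%d ", select_victim_node(&last, scan_nodes, 0));
	printf("\n");	/* prints: 0 2 0 2 */
	return 0;
}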
1862 1862
1863 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1863 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1864 struct zone *zone, 1864 struct zone *zone,
1865 gfp_t gfp_mask, 1865 gfp_t gfp_mask,
1866 unsigned long *total_scanned) 1866 unsigned long *total_scanned)
1867 { 1867 {
1868 struct mem_cgroup *victim = NULL; 1868 struct mem_cgroup *victim = NULL;
1869 int total = 0; 1869 int total = 0;
1870 int loop = 0; 1870 int loop = 0;
1871 unsigned long excess; 1871 unsigned long excess;
1872 unsigned long nr_scanned; 1872 unsigned long nr_scanned;
1873 struct mem_cgroup_reclaim_cookie reclaim = { 1873 struct mem_cgroup_reclaim_cookie reclaim = {
1874 .zone = zone, 1874 .zone = zone,
1875 .priority = 0, 1875 .priority = 0,
1876 }; 1876 };
1877 1877
1878 excess = soft_limit_excess(root_memcg); 1878 excess = soft_limit_excess(root_memcg);
1879 1879
1880 while (1) { 1880 while (1) {
1881 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1881 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1882 if (!victim) { 1882 if (!victim) {
1883 loop++; 1883 loop++;
1884 if (loop >= 2) { 1884 if (loop >= 2) {
1885 /* 1885 /*
1886 * If we have not been able to reclaim 1886 * If we have not been able to reclaim
1887 * anything, it might be because there are 1887 * anything, it might be because there are
1888 * no reclaimable pages under this hierarchy 1888 * no reclaimable pages under this hierarchy
1889 */ 1889 */
1890 if (!total) 1890 if (!total)
1891 break; 1891 break;
1892 /* 1892 /*
1893 * We want to do more targeted reclaim. 1893 * We want to do more targeted reclaim.
1894 * excess >> 2 is not too excessive, so as not to 1894 * excess >> 2 is not too excessive, so as not to
1895 * reclaim too much, nor too little, which would keep us 1895 * reclaim too much, nor too little, which would keep us
1896 * coming back to reclaim from this cgroup 1896 * coming back to reclaim from this cgroup
1897 */ 1897 */
1898 if (total >= (excess >> 2) || 1898 if (total >= (excess >> 2) ||
1899 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1899 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1900 break; 1900 break;
1901 } 1901 }
1902 continue; 1902 continue;
1903 } 1903 }
1904 if (!mem_cgroup_reclaimable(victim, false)) 1904 if (!mem_cgroup_reclaimable(victim, false))
1905 continue; 1905 continue;
1906 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1906 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1907 zone, &nr_scanned); 1907 zone, &nr_scanned);
1908 *total_scanned += nr_scanned; 1908 *total_scanned += nr_scanned;
1909 if (!soft_limit_excess(root_memcg)) 1909 if (!soft_limit_excess(root_memcg))
1910 break; 1910 break;
1911 } 1911 }
1912 mem_cgroup_iter_break(root_memcg, victim); 1912 mem_cgroup_iter_break(root_memcg, victim);
1913 return total; 1913 return total;
1914 } 1914 }
1915 1915
1916 #ifdef CONFIG_LOCKDEP 1916 #ifdef CONFIG_LOCKDEP
1917 static struct lockdep_map memcg_oom_lock_dep_map = { 1917 static struct lockdep_map memcg_oom_lock_dep_map = {
1918 .name = "memcg_oom_lock", 1918 .name = "memcg_oom_lock",
1919 }; 1919 };
1920 #endif 1920 #endif
1921 1921
1922 static DEFINE_SPINLOCK(memcg_oom_lock); 1922 static DEFINE_SPINLOCK(memcg_oom_lock);
1923 1923
1924 /* 1924 /*
1925 * Check whether the OOM killer is already running under our hierarchy. 1925 * Check whether the OOM killer is already running under our hierarchy.
1926 * If someone is already running it, return false. 1926 * If someone is already running it, return false.
1927 */ 1927 */
1928 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1928 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1929 { 1929 {
1930 struct mem_cgroup *iter, *failed = NULL; 1930 struct mem_cgroup *iter, *failed = NULL;
1931 1931
1932 spin_lock(&memcg_oom_lock); 1932 spin_lock(&memcg_oom_lock);
1933 1933
1934 for_each_mem_cgroup_tree(iter, memcg) { 1934 for_each_mem_cgroup_tree(iter, memcg) {
1935 if (iter->oom_lock) { 1935 if (iter->oom_lock) {
1936 /* 1936 /*
1937 * this subtree of our hierarchy is already locked 1937 * this subtree of our hierarchy is already locked
1938 * so we cannot give a lock. 1938 * so we cannot give a lock.
1939 */ 1939 */
1940 failed = iter; 1940 failed = iter;
1941 mem_cgroup_iter_break(memcg, iter); 1941 mem_cgroup_iter_break(memcg, iter);
1942 break; 1942 break;
1943 } else 1943 } else
1944 iter->oom_lock = true; 1944 iter->oom_lock = true;
1945 } 1945 }
1946 1946
1947 if (failed) { 1947 if (failed) {
1948 /* 1948 /*
1949 * OK, we failed to lock the whole subtree so we have 1949 * OK, we failed to lock the whole subtree so we have
1950 * to clean up what we set up to the failing subtree 1950 * to clean up what we set up to the failing subtree
1951 */ 1951 */
1952 for_each_mem_cgroup_tree(iter, memcg) { 1952 for_each_mem_cgroup_tree(iter, memcg) {
1953 if (iter == failed) { 1953 if (iter == failed) {
1954 mem_cgroup_iter_break(memcg, iter); 1954 mem_cgroup_iter_break(memcg, iter);
1955 break; 1955 break;
1956 } 1956 }
1957 iter->oom_lock = false; 1957 iter->oom_lock = false;
1958 } 1958 }
1959 } else 1959 } else
1960 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1960 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1961 1961
1962 spin_unlock(&memcg_oom_lock); 1962 spin_unlock(&memcg_oom_lock);
1963 1963
1964 return !failed; 1964 return !failed;
1965 } 1965 }
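
mem_cgroup_oom_trylock() is a classic try-everything-then-unwind pattern: the hierarchy walk stops at the first already-locked group and rolls back only the locks taken in this pass. A sketch over a flat array standing in for the subtree walk (assumption: array order models the iteration order, and no real locking is involved):

#include <stdbool.h>
#include <stdio.h>

/*
 * Userspace model of the hierarchical OOM trylock above: try to mark
 * every group in the subtree; if one is already locked, unwind only the
 * marks taken in this pass and report failure.
 */
static bool oom_trylock(bool *locked, int n)
{
	int failed = -1;

	for (int i = 0; i < n; i++) {
		if (locked[i]) {	/* part of the subtree is already locked */
			failed = i;
			break;
		}
		locked[i] = true;
	}
	if (failed >= 0)
		for (int i = 0; i < failed; i++)	/* roll back our marks */
			locked[i] = false;
	return failed < 0;
}

int main(void)
{
	bool subtree[4] = { false, false, true, false };

	printf("locked=%d\n", oom_trylock(subtree, 4));	/* 0: group 2 was busy */
	printf("state: %d %d %d %d\n",
	       subtree[0], subtree[1], subtree[2], subtree[3]);
	return 0;
}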
1966 1966
1967 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1967 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1968 { 1968 {
1969 struct mem_cgroup *iter; 1969 struct mem_cgroup *iter;
1970 1970
1971 spin_lock(&memcg_oom_lock); 1971 spin_lock(&memcg_oom_lock);
1972 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1972 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1973 for_each_mem_cgroup_tree(iter, memcg) 1973 for_each_mem_cgroup_tree(iter, memcg)
1974 iter->oom_lock = false; 1974 iter->oom_lock = false;
1975 spin_unlock(&memcg_oom_lock); 1975 spin_unlock(&memcg_oom_lock);
1976 } 1976 }
1977 1977
1978 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1978 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1979 { 1979 {
1980 struct mem_cgroup *iter; 1980 struct mem_cgroup *iter;
1981 1981
1982 for_each_mem_cgroup_tree(iter, memcg) 1982 for_each_mem_cgroup_tree(iter, memcg)
1983 atomic_inc(&iter->under_oom); 1983 atomic_inc(&iter->under_oom);
1984 } 1984 }
1985 1985
1986 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1986 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1987 { 1987 {
1988 struct mem_cgroup *iter; 1988 struct mem_cgroup *iter;
1989 1989
1990 /* 1990 /*
1991 * When a new child is created while the hierarchy is under oom, 1991 * When a new child is created while the hierarchy is under oom,
1992 * mem_cgroup_oom_lock() may not be called. We have to use 1992 * mem_cgroup_oom_lock() may not be called. We have to use
1993 * atomic_add_unless() here. 1993 * atomic_add_unless() here.
1994 */ 1994 */
1995 for_each_mem_cgroup_tree(iter, memcg) 1995 for_each_mem_cgroup_tree(iter, memcg)
1996 atomic_add_unless(&iter->under_oom, -1, 0); 1996 atomic_add_unless(&iter->under_oom, -1, 0);
1997 } 1997 }
1998 1998
1999 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1999 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2000 2000
2001 struct oom_wait_info { 2001 struct oom_wait_info {
2002 struct mem_cgroup *memcg; 2002 struct mem_cgroup *memcg;
2003 wait_queue_t wait; 2003 wait_queue_t wait;
2004 }; 2004 };
2005 2005
2006 static int memcg_oom_wake_function(wait_queue_t *wait, 2006 static int memcg_oom_wake_function(wait_queue_t *wait,
2007 unsigned mode, int sync, void *arg) 2007 unsigned mode, int sync, void *arg)
2008 { 2008 {
2009 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2009 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2010 struct mem_cgroup *oom_wait_memcg; 2010 struct mem_cgroup *oom_wait_memcg;
2011 struct oom_wait_info *oom_wait_info; 2011 struct oom_wait_info *oom_wait_info;
2012 2012
2013 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2013 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2014 oom_wait_memcg = oom_wait_info->memcg; 2014 oom_wait_memcg = oom_wait_info->memcg;
2015 2015
2016 /* 2016 /*
2017 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2017 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
2018 * Then we can use css_is_ancestor without taking care of RCU. 2018 * Then we can use css_is_ancestor without taking care of RCU.
2019 */ 2019 */
2020 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2020 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2021 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2021 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2022 return 0; 2022 return 0;
2023 return autoremove_wake_function(wait, mode, sync, arg); 2023 return autoremove_wake_function(wait, mode, sync, arg);
2024 } 2024 }
2025 2025
2026 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2026 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2027 { 2027 {
2028 atomic_inc(&memcg->oom_wakeups); 2028 atomic_inc(&memcg->oom_wakeups);
2029 /* for filtering, pass "memcg" as argument. */ 2029 /* for filtering, pass "memcg" as argument. */
2030 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2030 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2031 } 2031 }
2032 2032
2033 static void memcg_oom_recover(struct mem_cgroup *memcg) 2033 static void memcg_oom_recover(struct mem_cgroup *memcg)
2034 { 2034 {
2035 if (memcg && atomic_read(&memcg->under_oom)) 2035 if (memcg && atomic_read(&memcg->under_oom))
2036 memcg_wakeup_oom(memcg); 2036 memcg_wakeup_oom(memcg);
2037 } 2037 }
2038 2038
2039 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2039 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2040 { 2040 {
2041 if (!current->memcg_oom.may_oom) 2041 if (!current->memcg_oom.may_oom)
2042 return; 2042 return;
2043 /* 2043 /*
2044 * We are in the middle of the charge context here, so we 2044 * We are in the middle of the charge context here, so we
2045 * don't want to block when potentially sitting on a callstack 2045 * don't want to block when potentially sitting on a callstack
2046 * that holds all kinds of filesystem and mm locks. 2046 * that holds all kinds of filesystem and mm locks.
2047 * 2047 *
2048 * Also, the caller may handle a failed allocation gracefully 2048 * Also, the caller may handle a failed allocation gracefully
2049 * (like optional page cache readahead) and so an OOM killer 2049 * (like optional page cache readahead) and so an OOM killer
2050 * invocation might not even be necessary. 2050 * invocation might not even be necessary.
2051 * 2051 *
2052 * That's why we don't do anything here except remember the 2052 * That's why we don't do anything here except remember the
2053 * OOM context and then deal with it at the end of the page 2053 * OOM context and then deal with it at the end of the page
2054 * fault when the stack is unwound, the locks are released, 2054 * fault when the stack is unwound, the locks are released,
2055 * and when we know whether the fault was overall successful. 2055 * and when we know whether the fault was overall successful.
2056 */ 2056 */
2057 css_get(&memcg->css); 2057 css_get(&memcg->css);
2058 current->memcg_oom.memcg = memcg; 2058 current->memcg_oom.memcg = memcg;
2059 current->memcg_oom.gfp_mask = mask; 2059 current->memcg_oom.gfp_mask = mask;
2060 current->memcg_oom.order = order; 2060 current->memcg_oom.order = order;
2061 } 2061 }
2062 2062
2063 /** 2063 /**
2064 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2064 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2065 * @handle: actually kill/wait or just clean up the OOM state 2065 * @handle: actually kill/wait or just clean up the OOM state
2066 * 2066 *
2067 * This has to be called at the end of a page fault if the memcg OOM 2067 * This has to be called at the end of a page fault if the memcg OOM
2068 * handler was enabled. 2068 * handler was enabled.
2069 * 2069 *
2070 * Memcg supports userspace OOM handling where failed allocations must 2070 * Memcg supports userspace OOM handling where failed allocations must
2071 * sleep on a waitqueue until the userspace task resolves the 2071 * sleep on a waitqueue until the userspace task resolves the
2072 * situation. Sleeping directly in the charge context with all kinds 2072 * situation. Sleeping directly in the charge context with all kinds
2073 * of locks held is not a good idea, instead we remember an OOM state 2073 * of locks held is not a good idea, instead we remember an OOM state
2074 * in the task and mem_cgroup_oom_synchronize() has to be called at 2074 * in the task and mem_cgroup_oom_synchronize() has to be called at
2075 * the end of the page fault to complete the OOM handling. 2075 * the end of the page fault to complete the OOM handling.
2076 * 2076 *
2077 * Returns %true if an ongoing memcg OOM situation was detected and 2077 * Returns %true if an ongoing memcg OOM situation was detected and
2078 * completed, %false otherwise. 2078 * completed, %false otherwise.
2079 */ 2079 */
2080 bool mem_cgroup_oom_synchronize(bool handle) 2080 bool mem_cgroup_oom_synchronize(bool handle)
2081 { 2081 {
2082 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2082 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2083 struct oom_wait_info owait; 2083 struct oom_wait_info owait;
2084 bool locked; 2084 bool locked;
2085 2085
2086 /* OOM is global, do not handle */ 2086 /* OOM is global, do not handle */
2087 if (!memcg) 2087 if (!memcg)
2088 return false; 2088 return false;
2089 2089
2090 if (!handle) 2090 if (!handle)
2091 goto cleanup; 2091 goto cleanup;
2092 2092
2093 owait.memcg = memcg; 2093 owait.memcg = memcg;
2094 owait.wait.flags = 0; 2094 owait.wait.flags = 0;
2095 owait.wait.func = memcg_oom_wake_function; 2095 owait.wait.func = memcg_oom_wake_function;
2096 owait.wait.private = current; 2096 owait.wait.private = current;
2097 INIT_LIST_HEAD(&owait.wait.task_list); 2097 INIT_LIST_HEAD(&owait.wait.task_list);
2098 2098
2099 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2099 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2100 mem_cgroup_mark_under_oom(memcg); 2100 mem_cgroup_mark_under_oom(memcg);
2101 2101
2102 locked = mem_cgroup_oom_trylock(memcg); 2102 locked = mem_cgroup_oom_trylock(memcg);
2103 2103
2104 if (locked) 2104 if (locked)
2105 mem_cgroup_oom_notify(memcg); 2105 mem_cgroup_oom_notify(memcg);
2106 2106
2107 if (locked && !memcg->oom_kill_disable) { 2107 if (locked && !memcg->oom_kill_disable) {
2108 mem_cgroup_unmark_under_oom(memcg); 2108 mem_cgroup_unmark_under_oom(memcg);
2109 finish_wait(&memcg_oom_waitq, &owait.wait); 2109 finish_wait(&memcg_oom_waitq, &owait.wait);
2110 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2110 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2111 current->memcg_oom.order); 2111 current->memcg_oom.order);
2112 } else { 2112 } else {
2113 schedule(); 2113 schedule();
2114 mem_cgroup_unmark_under_oom(memcg); 2114 mem_cgroup_unmark_under_oom(memcg);
2115 finish_wait(&memcg_oom_waitq, &owait.wait); 2115 finish_wait(&memcg_oom_waitq, &owait.wait);
2116 } 2116 }
2117 2117
2118 if (locked) { 2118 if (locked) {
2119 mem_cgroup_oom_unlock(memcg); 2119 mem_cgroup_oom_unlock(memcg);
2120 /* 2120 /*
2121 * There is no guarantee that an OOM-lock contender 2121 * There is no guarantee that an OOM-lock contender
2122 * sees the wakeups triggered by the OOM kill 2122 * sees the wakeups triggered by the OOM kill
2123 * uncharges. Wake any sleepers explicitly. 2123 * uncharges. Wake any sleepers explicitly.
2124 */ 2124 */
2125 memcg_oom_recover(memcg); 2125 memcg_oom_recover(memcg);
2126 } 2126 }
2127 cleanup: 2127 cleanup:
2128 current->memcg_oom.memcg = NULL; 2128 current->memcg_oom.memcg = NULL;
2129 css_put(&memcg->css); 2129 css_put(&memcg->css);
2130 return true; 2130 return true;
2131 } 2131 }
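
Per the kerneldoc above, the page-fault exit path is expected to call this once locks are dropped: if the fault failed because of OOM, the memcg OOM handler gets the first chance, and only when no memcg OOM was recorded does the global OOM path run. A simplified consumer sketch (not the verbatim kernel fault path):

/* sketch of an end-of-fault consumer (simplified, not the exact kernel code) */
static void fault_out_of_memory_sketch(void)
{
	/* memcg OOM: kill inside the cgroup or wait for userspace handling */
	if (mem_cgroup_oom_synchronize(true))
		return;

	/* no memcg OOM was recorded for current: global OOM handling goes here */
}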
2132 2132
2133 /** 2133 /**
2134 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 2134 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
2135 * @page: page that is going to change accounted state 2135 * @page: page that is going to change accounted state
2136 * @locked: &memcg->move_lock slowpath was taken 2136 * @locked: &memcg->move_lock slowpath was taken
2137 * @flags: IRQ-state flags for &memcg->move_lock 2137 * @flags: IRQ-state flags for &memcg->move_lock
2138 * 2138 *
2139 * This function must mark the beginning of an accounted page state 2139 * This function must mark the beginning of an accounted page state
2140 * change to prevent double accounting when the page is concurrently 2140 * change to prevent double accounting when the page is concurrently
2141 * being moved to another memcg: 2141 * being moved to another memcg:
2142 * 2142 *
2143 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 2143 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
2144 * if (TestClearPageState(page)) 2144 * if (TestClearPageState(page))
2145 * mem_cgroup_update_page_stat(memcg, state, -1); 2145 * mem_cgroup_update_page_stat(memcg, state, -1);
2146 * mem_cgroup_end_page_stat(memcg, locked, flags); 2146 * mem_cgroup_end_page_stat(memcg, locked, flags);
2147 * 2147 *
2148 * The RCU lock is held throughout the transaction. The fast path can 2148 * The RCU lock is held throughout the transaction. The fast path can
2149 * get away without acquiring the memcg->move_lock (@locked is false) 2149 * get away without acquiring the memcg->move_lock (@locked is false)
2150 * because page moving starts with an RCU grace period. 2150 * because page moving starts with an RCU grace period.
2151 * 2151 *
2152 * The RCU lock also protects the memcg from being freed when the page 2152 * The RCU lock also protects the memcg from being freed when the page
2153 * state that is going to change is the only thing preventing the page 2153 * state that is going to change is the only thing preventing the page
2154 * from being uncharged. E.g. end-writeback clearing PageWriteback(), 2154 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2155 * which allows migration to go ahead and uncharge the page before the 2155 * which allows migration to go ahead and uncharge the page before the
2156 * account transaction might be complete. 2156 * account transaction might be complete.
2157 */ 2157 */
2158 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 2158 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2159 bool *locked, 2159 bool *locked,
2160 unsigned long *flags) 2160 unsigned long *flags)
2161 { 2161 {
2162 struct mem_cgroup *memcg; 2162 struct mem_cgroup *memcg;
2163 struct page_cgroup *pc; 2163 struct page_cgroup *pc;
2164 2164
2165 rcu_read_lock(); 2165 rcu_read_lock();
2166 2166
2167 if (mem_cgroup_disabled()) 2167 if (mem_cgroup_disabled())
2168 return NULL; 2168 return NULL;
2169 2169
2170 pc = lookup_page_cgroup(page); 2170 pc = lookup_page_cgroup(page);
2171 again: 2171 again:
2172 memcg = pc->mem_cgroup; 2172 memcg = pc->mem_cgroup;
2173 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2173 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2174 return NULL; 2174 return NULL;
2175 2175
2176 *locked = false; 2176 *locked = false;
2177 if (atomic_read(&memcg->moving_account) <= 0) 2177 if (atomic_read(&memcg->moving_account) <= 0)
2178 return memcg; 2178 return memcg;
2179 2179
2180 move_lock_mem_cgroup(memcg, flags); 2180 move_lock_mem_cgroup(memcg, flags);
2181 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2181 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2182 move_unlock_mem_cgroup(memcg, flags); 2182 move_unlock_mem_cgroup(memcg, flags);
2183 goto again; 2183 goto again;
2184 } 2184 }
2185 *locked = true; 2185 *locked = true;
2186 2186
2187 return memcg; 2187 return memcg;
2188 } 2188 }
2189 2189
2190 /** 2190 /**
2191 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2191 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2192 * @memcg: the memcg that was accounted against 2192 * @memcg: the memcg that was accounted against
2193 * @locked: value received from mem_cgroup_begin_page_stat() 2193 * @locked: value received from mem_cgroup_begin_page_stat()
2194 * @flags: value received from mem_cgroup_begin_page_stat() 2194 * @flags: value received from mem_cgroup_begin_page_stat()
2195 */ 2195 */
2196 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 2196 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
2197 unsigned long flags) 2197 unsigned long flags)
2198 { 2198 {
2199 if (memcg && locked) 2199 if (memcg && locked)
2200 move_unlock_mem_cgroup(memcg, &flags); 2200 move_unlock_mem_cgroup(memcg, &flags);
2201 2201
2202 rcu_read_unlock(); 2202 rcu_read_unlock();
2203 } 2203 }
2204 2204
2205 /** 2205 /**
2206 * mem_cgroup_update_page_stat - update page state statistics 2206 * mem_cgroup_update_page_stat - update page state statistics
2207 * @memcg: memcg to account against 2207 * @memcg: memcg to account against
2208 * @idx: page state item to account 2208 * @idx: page state item to account
2209 * @val: number of pages (positive or negative) 2209 * @val: number of pages (positive or negative)
2210 * 2210 *
2211 * See mem_cgroup_begin_page_stat() for locking requirements. 2211 * See mem_cgroup_begin_page_stat() for locking requirements.
2212 */ 2212 */
2213 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 2213 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2214 enum mem_cgroup_stat_index idx, int val) 2214 enum mem_cgroup_stat_index idx, int val)
2215 { 2215 {
2216 VM_BUG_ON(!rcu_read_lock_held()); 2216 VM_BUG_ON(!rcu_read_lock_held());
2217 2217
2218 if (memcg) 2218 if (memcg)
2219 this_cpu_add(memcg->stat->count[idx], val); 2219 this_cpu_add(memcg->stat->count[idx], val);
2220 } 2220 }
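
Putting the three helpers together, a caller follows exactly the transaction shown in the mem_cgroup_begin_page_stat() kerneldoc. The sketch below keeps the doc's hypothetical TestClearPageState()/state names; they are placeholders for a real page-flag test and a real mem_cgroup_stat_index value, not kernel symbols:

/* hypothetical caller, mirroring the kerneldoc example above */
static void example_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;
	bool locked;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	if (TestClearPageState(page))	/* placeholder page-flag test */
		mem_cgroup_update_page_stat(memcg, state, -1);	/* placeholder index */
	mem_cgroup_end_page_stat(memcg, locked, flags);
}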
2221 2221
2222 /* 2222 /*
2223 * size of first charge trial. "32" comes from vmscan.c's magic value. 2223 * size of first charge trial. "32" comes from vmscan.c's magic value.
2224 * TODO: maybe necessary to use big numbers in big irons. 2224 * TODO: maybe necessary to use big numbers in big irons.
2225 */ 2225 */
2226 #define CHARGE_BATCH 32U 2226 #define CHARGE_BATCH 32U
2227 struct memcg_stock_pcp { 2227 struct memcg_stock_pcp {
2228 struct mem_cgroup *cached; /* this is never the root cgroup */ 2228 struct mem_cgroup *cached; /* this is never the root cgroup */
2229 unsigned int nr_pages; 2229 unsigned int nr_pages;
2230 struct work_struct work; 2230 struct work_struct work;
2231 unsigned long flags; 2231 unsigned long flags;
2232 #define FLUSHING_CACHED_CHARGE 0 2232 #define FLUSHING_CACHED_CHARGE 0
2233 }; 2233 };
2234 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2234 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2235 static DEFINE_MUTEX(percpu_charge_mutex); 2235 static DEFINE_MUTEX(percpu_charge_mutex);
2236 2236
2237 /** 2237 /**
2238 * consume_stock: Try to consume stocked charge on this cpu. 2238 * consume_stock: Try to consume stocked charge on this cpu.
2239 * @memcg: memcg to consume from. 2239 * @memcg: memcg to consume from.
2240 * @nr_pages: how many pages to charge. 2240 * @nr_pages: how many pages to charge.
2241 * 2241 *
2242 * The charges will only happen if @memcg matches the current cpu's memcg 2242 * The charges will only happen if @memcg matches the current cpu's memcg
2243 * stock, and at least @nr_pages are available in that stock. Failure to 2243 * stock, and at least @nr_pages are available in that stock. Failure to
2244 * service an allocation will refill the stock. 2244 * service an allocation will refill the stock.
2245 * 2245 *
2246 * returns true if successful, false otherwise. 2246 * returns true if successful, false otherwise.
2247 */ 2247 */
2248 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2248 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2249 { 2249 {
2250 struct memcg_stock_pcp *stock; 2250 struct memcg_stock_pcp *stock;
2251 bool ret = false; 2251 bool ret = false;
2252 2252
2253 if (nr_pages > CHARGE_BATCH) 2253 if (nr_pages > CHARGE_BATCH)
2254 return ret; 2254 return ret;
2255 2255
2256 stock = &get_cpu_var(memcg_stock); 2256 stock = &get_cpu_var(memcg_stock);
2257 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2257 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2258 stock->nr_pages -= nr_pages; 2258 stock->nr_pages -= nr_pages;
2259 ret = true; 2259 ret = true;
2260 } 2260 }
2261 put_cpu_var(memcg_stock); 2261 put_cpu_var(memcg_stock);
2262 return ret; 2262 return ret;
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * Returns stocks cached in percpu and resets cached information. 2266 * Returns stocks cached in percpu and resets cached information.
2267 */ 2267 */
2268 static void drain_stock(struct memcg_stock_pcp *stock) 2268 static void drain_stock(struct memcg_stock_pcp *stock)
2269 { 2269 {
2270 struct mem_cgroup *old = stock->cached; 2270 struct mem_cgroup *old = stock->cached;
2271 2271
2272 if (stock->nr_pages) { 2272 if (stock->nr_pages) {
2273 page_counter_uncharge(&old->memory, stock->nr_pages); 2273 page_counter_uncharge(&old->memory, stock->nr_pages);
2274 if (do_swap_account) 2274 if (do_swap_account)
2275 page_counter_uncharge(&old->memsw, stock->nr_pages); 2275 page_counter_uncharge(&old->memsw, stock->nr_pages);
2276 css_put_many(&old->css, stock->nr_pages);
2276 stock->nr_pages = 0; 2277 stock->nr_pages = 0;
2277 } 2278 }
2278 stock->cached = NULL; 2279 stock->cached = NULL;
2279 } 2280 }
2280 2281
2281 /* 2282 /*
2282 * This must be called under preempt disabled or must be called by 2283 * This must be called under preempt disabled or must be called by
2283 * a thread which is pinned to local cpu. 2284 * a thread which is pinned to local cpu.
2284 */ 2285 */
2285 static void drain_local_stock(struct work_struct *dummy) 2286 static void drain_local_stock(struct work_struct *dummy)
2286 { 2287 {
2287 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 2288 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2288 drain_stock(stock); 2289 drain_stock(stock);
2289 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2290 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2290 } 2291 }
2291 2292
2292 static void __init memcg_stock_init(void) 2293 static void __init memcg_stock_init(void)
2293 { 2294 {
2294 int cpu; 2295 int cpu;
2295 2296
2296 for_each_possible_cpu(cpu) { 2297 for_each_possible_cpu(cpu) {
2297 struct memcg_stock_pcp *stock = 2298 struct memcg_stock_pcp *stock =
2298 &per_cpu(memcg_stock, cpu); 2299 &per_cpu(memcg_stock, cpu);
2299 INIT_WORK(&stock->work, drain_local_stock); 2300 INIT_WORK(&stock->work, drain_local_stock);
2300 } 2301 }
2301 } 2302 }
2302 2303
2303 /* 2304 /*
2304 * Cache charges(nr_pages) to local per_cpu area. 2305 * Cache charges(nr_pages) to local per_cpu area.
2305 * This will be consumed by consume_stock() function, later. 2306 * This will be consumed by consume_stock() function, later.
2306 */ 2307 */
2307 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2308 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2308 { 2309 {
2309 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2310 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2310 2311
2311 if (stock->cached != memcg) { /* reset if necessary */ 2312 if (stock->cached != memcg) { /* reset if necessary */
2312 drain_stock(stock); 2313 drain_stock(stock);
2313 stock->cached = memcg; 2314 stock->cached = memcg;
2314 } 2315 }
2315 stock->nr_pages += nr_pages; 2316 stock->nr_pages += nr_pages;
2316 put_cpu_var(memcg_stock); 2317 put_cpu_var(memcg_stock);
2317 } 2318 }
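
consume_stock()/refill_stock() implement a small per-CPU cache of pre-charged pages so that most single-page charges never touch the shared page counters. A userspace model of that caching idea (a plain struct instead of per-CPU data, no locking, and the 32-page batch kept only for illustration):

#include <stdio.h>

/*
 * Userspace model of the per-cpu charge stock: keep up to BATCH
 * pre-charged pages per "CPU" so small charges can be served locally.
 */
#define BATCH 32U

struct stock {
	const void *cached;	/* stand-in for the cached memcg pointer */
	unsigned int nr_pages;
};

static int consume(struct stock *s, const void *memcg, unsigned int nr)
{
	if (nr <= BATCH && s->cached == memcg && s->nr_pages >= nr) {
		s->nr_pages -= nr;
		return 1;	/* charged from the local stock */
	}
	return 0;		/* caller falls back to the shared counters */
}

static void refill(struct stock *s, const void *memcg, unsigned int nr)
{
	if (s->cached != memcg) {	/* reset if necessary */
		s->nr_pages = 0;	/* the real code uncharges these first */
		s->cached = memcg;
	}
	s->nr_pages += nr;
}

int main(void)
{
	struct stock s = { 0 };
	int memcg_a;	/* any address works as a stand-in memcg */
	int hit;

	refill(&s, &memcg_a, BATCH);
	hit = consume(&s, &memcg_a, 1);
	printf("hit=%d left=%u\n", hit, s.nr_pages);	/* hit=1 left=31 */
	return 0;
}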
2318 2319
2319 /* 2320 /*
2320 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2321 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2321 * of the hierarchy under it. sync flag says whether we should block 2322 * of the hierarchy under it. sync flag says whether we should block
2322 * until the work is done. 2323 * until the work is done.
2323 */ 2324 */
2324 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2325 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2325 { 2326 {
2326 int cpu, curcpu; 2327 int cpu, curcpu;
2327 2328
2328 /* Notify other cpus that system-wide "drain" is running */ 2329 /* Notify other cpus that system-wide "drain" is running */
2329 get_online_cpus(); 2330 get_online_cpus();
2330 curcpu = get_cpu(); 2331 curcpu = get_cpu();
2331 for_each_online_cpu(cpu) { 2332 for_each_online_cpu(cpu) {
2332 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2333 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2333 struct mem_cgroup *memcg; 2334 struct mem_cgroup *memcg;
2334 2335
2335 memcg = stock->cached; 2336 memcg = stock->cached;
2336 if (!memcg || !stock->nr_pages) 2337 if (!memcg || !stock->nr_pages)
2337 continue; 2338 continue;
2338 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2339 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2339 continue; 2340 continue;
2340 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2341 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2341 if (cpu == curcpu) 2342 if (cpu == curcpu)
2342 drain_local_stock(&stock->work); 2343 drain_local_stock(&stock->work);
2343 else 2344 else
2344 schedule_work_on(cpu, &stock->work); 2345 schedule_work_on(cpu, &stock->work);
2345 } 2346 }
2346 } 2347 }
2347 put_cpu(); 2348 put_cpu();
2348 2349
2349 if (!sync) 2350 if (!sync)
2350 goto out; 2351 goto out;
2351 2352
2352 for_each_online_cpu(cpu) { 2353 for_each_online_cpu(cpu) {
2353 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2354 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2354 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2355 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2355 flush_work(&stock->work); 2356 flush_work(&stock->work);
2356 } 2357 }
2357 out: 2358 out:
2358 put_online_cpus(); 2359 put_online_cpus();
2359 } 2360 }
2360 2361
2361 /* 2362 /*
2362 * Tries to drain stocked charges in other cpus. This function is asynchronous 2363 * Tries to drain stocked charges in other cpus. This function is asynchronous
2363 * and just puts a work per cpu for draining locally on each cpu. Caller can 2364 * and just puts a work per cpu for draining locally on each cpu. Caller can
2364 * expect some charges will be back later but cannot wait for it. 2365 * expect some charges will be back later but cannot wait for it.
2365 */ 2366 */
2366 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2367 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2367 { 2368 {
2368 /* 2369 /*
2369 * If someone calls draining, avoid adding more kworker runs. 2370 * If someone calls draining, avoid adding more kworker runs.
2370 */ 2371 */
2371 if (!mutex_trylock(&percpu_charge_mutex)) 2372 if (!mutex_trylock(&percpu_charge_mutex))
2372 return; 2373 return;
2373 drain_all_stock(root_memcg, false); 2374 drain_all_stock(root_memcg, false);
2374 mutex_unlock(&percpu_charge_mutex); 2375 mutex_unlock(&percpu_charge_mutex);
2375 } 2376 }
2376 2377
2377 /* This is a synchronous drain interface. */ 2378 /* This is a synchronous drain interface. */
2378 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2379 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2379 { 2380 {
2380 /* called when force_empty is called */ 2381 /* called when force_empty is called */
2381 mutex_lock(&percpu_charge_mutex); 2382 mutex_lock(&percpu_charge_mutex);
2382 drain_all_stock(root_memcg, true); 2383 drain_all_stock(root_memcg, true);
2383 mutex_unlock(&percpu_charge_mutex); 2384 mutex_unlock(&percpu_charge_mutex);
2384 } 2385 }
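The difference between the two entry points is just mutex_trylock() versus mutex_lock() on percpu_charge_mutex: the asynchronous caller gives up if a drain is already in flight, while the synchronous caller waits for its turn and then blocks until the per-cpu work has run. A minimal user-space analogue of that pattern (pthreads; the names are hypothetical and this is only a sketch, not kernel code):

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

        static void drain_all(bool sync)
        {
                /* stand-in for scheduling (and, if sync, flushing) per-cpu work */
                printf("draining, sync=%d\n", sync);
        }

        static void drain_async(void)
        {
                if (pthread_mutex_trylock(&drain_mutex))
                        return;                 /* someone is already draining */
                drain_all(false);
                pthread_mutex_unlock(&drain_mutex);
        }

        static void drain_sync(void)
        {
                pthread_mutex_lock(&drain_mutex);       /* always wait our turn */
                drain_all(true);
                pthread_mutex_unlock(&drain_mutex);
        }

        int main(void)
        {
                drain_async();
                drain_sync();
                return 0;
        }

The trylock on the asynchronous side is what keeps reclaim from piling up redundant kworker runs.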
2385 2386
2386 /* 2387 /*
2387 * This function drains the percpu counter values from a DEAD cpu and 2388 * This function drains the percpu counter values from a DEAD cpu and
2388 * moves them to the local cpu. Note that this function can be preempted. 2389 * moves them to the local cpu. Note that this function can be preempted.
2389 */ 2390 */
2390 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2391 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2391 { 2392 {
2392 int i; 2393 int i;
2393 2394
2394 spin_lock(&memcg->pcp_counter_lock); 2395 spin_lock(&memcg->pcp_counter_lock);
2395 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2396 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2396 long x = per_cpu(memcg->stat->count[i], cpu); 2397 long x = per_cpu(memcg->stat->count[i], cpu);
2397 2398
2398 per_cpu(memcg->stat->count[i], cpu) = 0; 2399 per_cpu(memcg->stat->count[i], cpu) = 0;
2399 memcg->nocpu_base.count[i] += x; 2400 memcg->nocpu_base.count[i] += x;
2400 } 2401 }
2401 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2402 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2402 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2403 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2403 2404
2404 per_cpu(memcg->stat->events[i], cpu) = 0; 2405 per_cpu(memcg->stat->events[i], cpu) = 0;
2405 memcg->nocpu_base.events[i] += x; 2406 memcg->nocpu_base.events[i] += x;
2406 } 2407 }
2407 spin_unlock(&memcg->pcp_counter_lock); 2408 spin_unlock(&memcg->pcp_counter_lock);
2408 } 2409 }
2409 2410
2410 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2411 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2411 unsigned long action, 2412 unsigned long action,
2412 void *hcpu) 2413 void *hcpu)
2413 { 2414 {
2414 int cpu = (unsigned long)hcpu; 2415 int cpu = (unsigned long)hcpu;
2415 struct memcg_stock_pcp *stock; 2416 struct memcg_stock_pcp *stock;
2416 struct mem_cgroup *iter; 2417 struct mem_cgroup *iter;
2417 2418
2418 if (action == CPU_ONLINE) 2419 if (action == CPU_ONLINE)
2419 return NOTIFY_OK; 2420 return NOTIFY_OK;
2420 2421
2421 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2422 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2422 return NOTIFY_OK; 2423 return NOTIFY_OK;
2423 2424
2424 for_each_mem_cgroup(iter) 2425 for_each_mem_cgroup(iter)
2425 mem_cgroup_drain_pcp_counter(iter, cpu); 2426 mem_cgroup_drain_pcp_counter(iter, cpu);
2426 2427
2427 stock = &per_cpu(memcg_stock, cpu); 2428 stock = &per_cpu(memcg_stock, cpu);
2428 drain_stock(stock); 2429 drain_stock(stock);
2429 return NOTIFY_OK; 2430 return NOTIFY_OK;
2430 } 2431 }
2431 2432
2432 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2433 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2433 unsigned int nr_pages) 2434 unsigned int nr_pages)
2434 { 2435 {
2435 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2436 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2436 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2437 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2437 struct mem_cgroup *mem_over_limit; 2438 struct mem_cgroup *mem_over_limit;
2438 struct page_counter *counter; 2439 struct page_counter *counter;
2439 unsigned long nr_reclaimed; 2440 unsigned long nr_reclaimed;
2440 bool may_swap = true; 2441 bool may_swap = true;
2441 bool drained = false; 2442 bool drained = false;
2442 int ret = 0; 2443 int ret = 0;
2443 2444
2444 if (mem_cgroup_is_root(memcg)) 2445 if (mem_cgroup_is_root(memcg))
2445 goto done; 2446 goto done;
2446 retry: 2447 retry:
2447 if (consume_stock(memcg, nr_pages)) 2448 if (consume_stock(memcg, nr_pages))
2448 goto done; 2449 goto done;
2449 2450
2450 if (!do_swap_account || 2451 if (!do_swap_account ||
2451 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2452 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2452 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2453 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2453 goto done_restock; 2454 goto done_restock;
2454 if (do_swap_account) 2455 if (do_swap_account)
2455 page_counter_uncharge(&memcg->memsw, batch); 2456 page_counter_uncharge(&memcg->memsw, batch);
2456 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2457 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2457 } else { 2458 } else {
2458 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2459 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2459 may_swap = false; 2460 may_swap = false;
2460 } 2461 }
2461 2462
2462 if (batch > nr_pages) { 2463 if (batch > nr_pages) {
2463 batch = nr_pages; 2464 batch = nr_pages;
2464 goto retry; 2465 goto retry;
2465 } 2466 }
2466 2467
2467 /* 2468 /*
2468 * Unlike in global OOM situations, memcg is not in a physical 2469 * Unlike in global OOM situations, memcg is not in a physical
2469 * memory shortage. Allow dying and OOM-killed tasks to 2470 * memory shortage. Allow dying and OOM-killed tasks to
2470 * bypass the last charges so that they can exit quickly and 2471 * bypass the last charges so that they can exit quickly and
2471 * free their memory. 2472 * free their memory.
2472 */ 2473 */
2473 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2474 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2474 fatal_signal_pending(current) || 2475 fatal_signal_pending(current) ||
2475 current->flags & PF_EXITING)) 2476 current->flags & PF_EXITING))
2476 goto bypass; 2477 goto bypass;
2477 2478
2478 if (unlikely(task_in_memcg_oom(current))) 2479 if (unlikely(task_in_memcg_oom(current)))
2479 goto nomem; 2480 goto nomem;
2480 2481
2481 if (!(gfp_mask & __GFP_WAIT)) 2482 if (!(gfp_mask & __GFP_WAIT))
2482 goto nomem; 2483 goto nomem;
2483 2484
2484 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2485 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2485 gfp_mask, may_swap); 2486 gfp_mask, may_swap);
2486 2487
2487 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2488 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2488 goto retry; 2489 goto retry;
2489 2490
2490 if (!drained) { 2491 if (!drained) {
2491 drain_all_stock_async(mem_over_limit); 2492 drain_all_stock_async(mem_over_limit);
2492 drained = true; 2493 drained = true;
2493 goto retry; 2494 goto retry;
2494 } 2495 }
2495 2496
2496 if (gfp_mask & __GFP_NORETRY) 2497 if (gfp_mask & __GFP_NORETRY)
2497 goto nomem; 2498 goto nomem;
2498 /* 2499 /*
2499 * Even though the limit is exceeded at this point, reclaim 2500 * Even though the limit is exceeded at this point, reclaim
2500 * may have been able to free some pages. Retry the charge 2501 * may have been able to free some pages. Retry the charge
2501 * before killing the task. 2502 * before killing the task.
2502 * 2503 *
2503 * Only for regular pages, though: huge pages are rather 2504 * Only for regular pages, though: huge pages are rather
2504 * unlikely to succeed so close to the limit, and we fall back 2505 * unlikely to succeed so close to the limit, and we fall back
2505 * to regular pages anyway in case of failure. 2506 * to regular pages anyway in case of failure.
2506 */ 2507 */
2507 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2508 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2508 goto retry; 2509 goto retry;
2509 /* 2510 /*
2510 * During task move, charges can be doubly counted. So it's 2511 * During task move, charges can be doubly counted. So it's
2511 * better to wait until the end of task_move if a move is in progress. 2512 * better to wait until the end of task_move if a move is in progress.
2512 */ 2513 */
2513 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2514 if (mem_cgroup_wait_acct_move(mem_over_limit))
2514 goto retry; 2515 goto retry;
2515 2516
2516 if (nr_retries--) 2517 if (nr_retries--)
2517 goto retry; 2518 goto retry;
2518 2519
2519 if (gfp_mask & __GFP_NOFAIL) 2520 if (gfp_mask & __GFP_NOFAIL)
2520 goto bypass; 2521 goto bypass;
2521 2522
2522 if (fatal_signal_pending(current)) 2523 if (fatal_signal_pending(current))
2523 goto bypass; 2524 goto bypass;
2524 2525
2525 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2526 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2526 nomem: 2527 nomem:
2527 if (!(gfp_mask & __GFP_NOFAIL)) 2528 if (!(gfp_mask & __GFP_NOFAIL))
2528 return -ENOMEM; 2529 return -ENOMEM;
2529 bypass: 2530 bypass:
2530 return -EINTR; 2531 return -EINTR;
2531 2532
2532 done_restock: 2533 done_restock:
2534 css_get_many(&memcg->css, batch);
2533 if (batch > nr_pages) 2535 if (batch > nr_pages)
2534 refill_stock(memcg, batch - nr_pages); 2536 refill_stock(memcg, batch - nr_pages);
2535 done: 2537 done:
2536 return ret; 2538 return ret;
2537 } 2539 }
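As a rough model of the batching in try_charge() above: the caller asks for nr_pages, a whole CHARGE_BATCH worth is charged against the shared counters at once, and the surplus is parked in the per-cpu stock so that the next small charges are served locally (consume_stock()) instead of touching the counters again. A user-space sketch under simplifying assumptions (one thread per "cpu", no limits or reclaim; BATCH and the variable names are hypothetical):

        #include <stdio.h>

        #define BATCH 32U                       /* stands in for CHARGE_BATCH */

        static unsigned long shared_counter;     /* expensive to update */
        static _Thread_local unsigned int stock; /* cheap, per-"cpu" surplus */

        static void charge(unsigned int nr_pages)
        {
                if (stock >= nr_pages) {        /* consume_stock() fast path */
                        stock -= nr_pages;
                        return;
                }
                shared_counter += BATCH > nr_pages ? BATCH : nr_pages;
                if (BATCH > nr_pages)
                        stock += BATCH - nr_pages;      /* refill_stock() */
        }

        int main(void)
        {
                charge(1);      /* hits the shared counter, stocks the surplus */
                charge(1);      /* served entirely from the local stock */
                printf("counter=%lu stock=%u\n", shared_counter, stock);
                return 0;
        }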
2538 2540
2539 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2541 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2540 { 2542 {
2541 if (mem_cgroup_is_root(memcg)) 2543 if (mem_cgroup_is_root(memcg))
2542 return; 2544 return;
2543 2545
2544 page_counter_uncharge(&memcg->memory, nr_pages); 2546 page_counter_uncharge(&memcg->memory, nr_pages);
2545 if (do_swap_account) 2547 if (do_swap_account)
2546 page_counter_uncharge(&memcg->memsw, nr_pages); 2548 page_counter_uncharge(&memcg->memsw, nr_pages);
2549
2550 css_put_many(&memcg->css, nr_pages);
2547 } 2551 }
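The new css_get_many()/css_put_many() calls are the point of this patch: try_charge() now takes one css reference per page in the charged batch (surplus references travel with the stock), and cancel_charge(), like the uncharge paths, drops one reference per page it hands back. A user-space sketch of that pairing (hypothetical names, a plain C11 atomic counter rather than the kernel's percpu_ref):

        #include <stdatomic.h>
        #include <assert.h>

        struct group {
                atomic_long refs;               /* stand-in for the css refcount */
        };

        static void get_many(struct group *g, unsigned int nr)
        {
                atomic_fetch_add(&g->refs, nr);
        }

        static void put_many(struct group *g, unsigned int nr)
        {
                long old = atomic_fetch_sub(&g->refs, nr);
                assert(old >= (long)nr);        /* never drop more than we hold */
        }

        static void charge(struct group *g, unsigned int nr_pages)
        {
                /* ... page counters charged here ... */
                get_many(g, nr_pages);          /* one reference per charged page */
        }

        static void cancel_charge(struct group *g, unsigned int nr_pages)
        {
                /* ... page counters rolled back here ... */
                put_many(g, nr_pages);          /* drop the per-page references */
        }

        int main(void)
        {
                struct group g = { .refs = 1 }; /* base reference */

                charge(&g, 32);
                cancel_charge(&g, 32);
                return atomic_load(&g.refs) == 1 ? 0 : 1;
        }

The charge paths and the uncharge/cancel paths stay balanced as long as every get_many() is matched by a put_many() for the same number of pages.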
2548 2552
2549 /* 2553 /*
2550 * A helper function to get a mem_cgroup from an ID. Must be called under 2554 * A helper function to get a mem_cgroup from an ID. Must be called under
2551 * rcu_read_lock(). The caller is responsible for calling 2555 * rcu_read_lock(). The caller is responsible for calling
2552 * css_tryget_online() if the mem_cgroup is used for charging. (dropping a 2556 * css_tryget_online() if the mem_cgroup is used for charging. (dropping a
2553 * refcnt from swap can happen against a removed memcg.) 2557 * refcnt from swap can happen against a removed memcg.)
2554 */ 2558 */
2555 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2559 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2556 { 2560 {
2557 /* ID 0 is unused ID */ 2561 /* ID 0 is unused ID */
2558 if (!id) 2562 if (!id)
2559 return NULL; 2563 return NULL;
2560 return mem_cgroup_from_id(id); 2564 return mem_cgroup_from_id(id);
2561 } 2565 }
2562 2566
2563 /* 2567 /*
2564 * try_get_mem_cgroup_from_page - look up page's memcg association 2568 * try_get_mem_cgroup_from_page - look up page's memcg association
2565 * @page: the page 2569 * @page: the page
2566 * 2570 *
2567 * Look up, get a css reference, and return the memcg that owns @page. 2571 * Look up, get a css reference, and return the memcg that owns @page.
2568 * 2572 *
2569 * The page must be locked to prevent racing with swap-in and page 2573 * The page must be locked to prevent racing with swap-in and page
2570 * cache charges. If coming from an unlocked page table, the caller 2574 * cache charges. If coming from an unlocked page table, the caller
2571 * must ensure the page is on the LRU or this can race with charging. 2575 * must ensure the page is on the LRU or this can race with charging.
2572 */ 2576 */
2573 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2577 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2574 { 2578 {
2575 struct mem_cgroup *memcg = NULL; 2579 struct mem_cgroup *memcg = NULL;
2576 struct page_cgroup *pc; 2580 struct page_cgroup *pc;
2577 unsigned short id; 2581 unsigned short id;
2578 swp_entry_t ent; 2582 swp_entry_t ent;
2579 2583
2580 VM_BUG_ON_PAGE(!PageLocked(page), page); 2584 VM_BUG_ON_PAGE(!PageLocked(page), page);
2581 2585
2582 pc = lookup_page_cgroup(page); 2586 pc = lookup_page_cgroup(page);
2583 if (PageCgroupUsed(pc)) { 2587 if (PageCgroupUsed(pc)) {
2584 memcg = pc->mem_cgroup; 2588 memcg = pc->mem_cgroup;
2585 if (memcg && !css_tryget_online(&memcg->css)) 2589 if (memcg && !css_tryget_online(&memcg->css))
2586 memcg = NULL; 2590 memcg = NULL;
2587 } else if (PageSwapCache(page)) { 2591 } else if (PageSwapCache(page)) {
2588 ent.val = page_private(page); 2592 ent.val = page_private(page);
2589 id = lookup_swap_cgroup_id(ent); 2593 id = lookup_swap_cgroup_id(ent);
2590 rcu_read_lock(); 2594 rcu_read_lock();
2591 memcg = mem_cgroup_lookup(id); 2595 memcg = mem_cgroup_lookup(id);
2592 if (memcg && !css_tryget_online(&memcg->css)) 2596 if (memcg && !css_tryget_online(&memcg->css))
2593 memcg = NULL; 2597 memcg = NULL;
2594 rcu_read_unlock(); 2598 rcu_read_unlock();
2595 } 2599 }
2596 return memcg; 2600 return memcg;
2597 } 2601 }
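try_get_mem_cgroup_from_page() only returns a memcg when css_tryget_online() still succeeds, i.e. when a reference can actually be taken before the group goes away. The core of that "look up, then tryget" pattern can be sketched in user space with a plain atomic counter (hypothetical names; the real css refcount is a percpu_ref with an online state, which this does not model):

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stddef.h>

        struct obj {
                atomic_long refs;
        };

        static bool tryget(struct obj *o)
        {
                long cur = atomic_load(&o->refs);

                while (cur > 0) {
                        /* only succeed while at least one reference is left */
                        if (atomic_compare_exchange_weak(&o->refs, &cur, cur + 1))
                                return true;
                }
                return false;                   /* object already on its way out */
        }

        static struct obj *lookup_and_get(struct obj *candidate)
        {
                if (candidate && !tryget(candidate))
                        candidate = NULL;       /* raced with the final put */
                return candidate;
        }

        int main(void)
        {
                struct obj live = { .refs = 1 }, dead = { .refs = 0 };

                return (lookup_and_get(&live) == &live &&
                        lookup_and_get(&dead) == NULL) ? 0 : 1;
        }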
2598 2602
2599 static void lock_page_lru(struct page *page, int *isolated) 2603 static void lock_page_lru(struct page *page, int *isolated)
2600 { 2604 {
2601 struct zone *zone = page_zone(page); 2605 struct zone *zone = page_zone(page);
2602 2606
2603 spin_lock_irq(&zone->lru_lock); 2607 spin_lock_irq(&zone->lru_lock);
2604 if (PageLRU(page)) { 2608 if (PageLRU(page)) {
2605 struct lruvec *lruvec; 2609 struct lruvec *lruvec;
2606 2610
2607 lruvec = mem_cgroup_page_lruvec(page, zone); 2611 lruvec = mem_cgroup_page_lruvec(page, zone);
2608 ClearPageLRU(page); 2612 ClearPageLRU(page);
2609 del_page_from_lru_list(page, lruvec, page_lru(page)); 2613 del_page_from_lru_list(page, lruvec, page_lru(page));
2610 *isolated = 1; 2614 *isolated = 1;
2611 } else 2615 } else
2612 *isolated = 0; 2616 *isolated = 0;
2613 } 2617 }
2614 2618
2615 static void unlock_page_lru(struct page *page, int isolated) 2619 static void unlock_page_lru(struct page *page, int isolated)
2616 { 2620 {
2617 struct zone *zone = page_zone(page); 2621 struct zone *zone = page_zone(page);
2618 2622
2619 if (isolated) { 2623 if (isolated) {
2620 struct lruvec *lruvec; 2624 struct lruvec *lruvec;
2621 2625
2622 lruvec = mem_cgroup_page_lruvec(page, zone); 2626 lruvec = mem_cgroup_page_lruvec(page, zone);
2623 VM_BUG_ON_PAGE(PageLRU(page), page); 2627 VM_BUG_ON_PAGE(PageLRU(page), page);
2624 SetPageLRU(page); 2628 SetPageLRU(page);
2625 add_page_to_lru_list(page, lruvec, page_lru(page)); 2629 add_page_to_lru_list(page, lruvec, page_lru(page));
2626 } 2630 }
2627 spin_unlock_irq(&zone->lru_lock); 2631 spin_unlock_irq(&zone->lru_lock);
2628 } 2632 }
2629 2633
2630 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2634 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2631 bool lrucare) 2635 bool lrucare)
2632 { 2636 {
2633 struct page_cgroup *pc = lookup_page_cgroup(page); 2637 struct page_cgroup *pc = lookup_page_cgroup(page);
2634 int isolated; 2638 int isolated;
2635 2639
2636 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2640 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2637 /* 2641 /*
2638 * we don't need page_cgroup_lock for tail pages, because they are not 2642 * we don't need page_cgroup_lock for tail pages, because they are not
2639 * accessed by any other context at this point. 2643 * accessed by any other context at this point.
2640 */ 2644 */
2641 2645
2642 /* 2646 /*
2643 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page 2647 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page
2644 * may already be on some other mem_cgroup's LRU. Take care of it. 2648 * may already be on some other mem_cgroup's LRU. Take care of it.
2645 */ 2649 */
2646 if (lrucare) 2650 if (lrucare)
2647 lock_page_lru(page, &isolated); 2651 lock_page_lru(page, &isolated);
2648 2652
2649 /* 2653 /*
2650 * Nobody should be changing or seriously looking at 2654 * Nobody should be changing or seriously looking at
2651 * pc->mem_cgroup and pc->flags at this point: 2655 * pc->mem_cgroup and pc->flags at this point:
2652 * 2656 *
2653 * - the page is uncharged 2657 * - the page is uncharged
2654 * 2658 *
2655 * - the page is off-LRU 2659 * - the page is off-LRU
2656 * 2660 *
2657 * - an anonymous fault has exclusive page access, except for 2661 * - an anonymous fault has exclusive page access, except for
2658 * a locked page table 2662 * a locked page table
2659 * 2663 *
2660 * - a page cache insertion, a swapin fault, or a migration 2664 * - a page cache insertion, a swapin fault, or a migration
2661 * have the page locked 2665 * have the page locked
2662 */ 2666 */
2663 pc->mem_cgroup = memcg; 2667 pc->mem_cgroup = memcg;
2664 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); 2668 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2665 2669
2666 if (lrucare) 2670 if (lrucare)
2667 unlock_page_lru(page, isolated); 2671 unlock_page_lru(page, isolated);
2668 } 2672 }
2669 2673
2670 #ifdef CONFIG_MEMCG_KMEM 2674 #ifdef CONFIG_MEMCG_KMEM
2671 /* 2675 /*
2672 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2676 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
2673 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2677 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2674 */ 2678 */
2675 static DEFINE_MUTEX(memcg_slab_mutex); 2679 static DEFINE_MUTEX(memcg_slab_mutex);
2676 2680
2677 static DEFINE_MUTEX(activate_kmem_mutex); 2681 static DEFINE_MUTEX(activate_kmem_mutex);
2678 2682
2679 /* 2683 /*
2680 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2684 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2681 * in the memcg_cache_params struct. 2685 * in the memcg_cache_params struct.
2682 */ 2686 */
2683 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2687 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2684 { 2688 {
2685 struct kmem_cache *cachep; 2689 struct kmem_cache *cachep;
2686 2690
2687 VM_BUG_ON(p->is_root_cache); 2691 VM_BUG_ON(p->is_root_cache);
2688 cachep = p->root_cache; 2692 cachep = p->root_cache;
2689 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2693 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2690 } 2694 }
2691 2695
2692 #ifdef CONFIG_SLABINFO 2696 #ifdef CONFIG_SLABINFO
2693 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2697 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2694 { 2698 {
2695 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2699 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2696 struct memcg_cache_params *params; 2700 struct memcg_cache_params *params;
2697 2701
2698 if (!memcg_kmem_is_active(memcg)) 2702 if (!memcg_kmem_is_active(memcg))
2699 return -EIO; 2703 return -EIO;
2700 2704
2701 print_slabinfo_header(m); 2705 print_slabinfo_header(m);
2702 2706
2703 mutex_lock(&memcg_slab_mutex); 2707 mutex_lock(&memcg_slab_mutex);
2704 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2708 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2705 cache_show(memcg_params_to_cache(params), m); 2709 cache_show(memcg_params_to_cache(params), m);
2706 mutex_unlock(&memcg_slab_mutex); 2710 mutex_unlock(&memcg_slab_mutex);
2707 2711
2708 return 0; 2712 return 0;
2709 } 2713 }
2710 #endif 2714 #endif
2711 2715
2712 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2716 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2713 unsigned long nr_pages) 2717 unsigned long nr_pages)
2714 { 2718 {
2715 struct page_counter *counter; 2719 struct page_counter *counter;
2716 int ret = 0; 2720 int ret = 0;
2717 2721
2718 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2722 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2719 if (ret < 0) 2723 if (ret < 0)
2720 return ret; 2724 return ret;
2721 2725
2722 ret = try_charge(memcg, gfp, nr_pages); 2726 ret = try_charge(memcg, gfp, nr_pages);
2723 if (ret == -EINTR) { 2727 if (ret == -EINTR) {
2724 /* 2728 /*
2725 * try_charge() chose to bypass to root due to OOM kill or 2729 * try_charge() chose to bypass to root due to OOM kill or
2726 * fatal signal. Since our only options are to either fail 2730 * fatal signal. Since our only options are to either fail
2727 * the allocation or charge it to this cgroup, do it as a 2731 * the allocation or charge it to this cgroup, do it as a
2728 * temporary condition. But we can't fail. From a kmem/slab 2732 * temporary condition. But we can't fail. From a kmem/slab
2729 * perspective, the cache has already been selected, by 2733 * perspective, the cache has already been selected, by
2730 * mem_cgroup_kmem_get_cache(), so it is too late to change 2734 * mem_cgroup_kmem_get_cache(), so it is too late to change
2731 * our minds. 2735 * our minds.
2732 * 2736 *
2733 * This condition will only trigger if the task entered 2737 * This condition will only trigger if the task entered
2734 * memcg_charge_kmem in a sane state, but was OOM-killed 2738 * memcg_charge_kmem in a sane state, but was OOM-killed
2735 * during try_charge() above. Tasks that were already dying 2739 * during try_charge() above. Tasks that were already dying
2736 * when the allocation triggers should have been already 2740 * when the allocation triggers should have been already
2737 * directed to the root cgroup in memcontrol.h 2741 * directed to the root cgroup in memcontrol.h
2738 */ 2742 */
2739 page_counter_charge(&memcg->memory, nr_pages); 2743 page_counter_charge(&memcg->memory, nr_pages);
2740 if (do_swap_account) 2744 if (do_swap_account)
2741 page_counter_charge(&memcg->memsw, nr_pages); 2745 page_counter_charge(&memcg->memsw, nr_pages);
2746 css_get_many(&memcg->css, nr_pages);
2742 ret = 0; 2747 ret = 0;
2743 } else if (ret) 2748 } else if (ret)
2744 page_counter_uncharge(&memcg->kmem, nr_pages); 2749 page_counter_uncharge(&memcg->kmem, nr_pages);
2745 2750
2746 return ret; 2751 return ret;
2747 } 2752 }
2748 2753
2749 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2754 static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2750 unsigned long nr_pages) 2755 unsigned long nr_pages)
2751 { 2756 {
2752 page_counter_uncharge(&memcg->memory, nr_pages); 2757 page_counter_uncharge(&memcg->memory, nr_pages);
2753 if (do_swap_account) 2758 if (do_swap_account)
2754 page_counter_uncharge(&memcg->memsw, nr_pages); 2759 page_counter_uncharge(&memcg->memsw, nr_pages);
2755 2760
2756 /* Not down to 0 */ 2761 /* Not down to 0 */
2757 if (page_counter_uncharge(&memcg->kmem, nr_pages)) 2762 if (page_counter_uncharge(&memcg->kmem, nr_pages)) {
2763 css_put_many(&memcg->css, nr_pages);
2758 return; 2764 return;
2765 }
2759 2766
2760 /* 2767 /*
2761 * Releases a reference taken in kmem_cgroup_css_offline in case 2768 * Releases a reference taken in kmem_cgroup_css_offline in case
2762 * this last uncharge is racing with the offlining code or it is 2769 * this last uncharge is racing with the offlining code or it is
2763 * outliving the memcg existence. 2770 * outliving the memcg existence.
2764 * 2771 *
2765 * The memory barrier imposed by test&clear is paired with the 2772 * The memory barrier imposed by test&clear is paired with the
2766 * explicit one in memcg_kmem_mark_dead(). 2773 * explicit one in memcg_kmem_mark_dead().
2767 */ 2774 */
2768 if (memcg_kmem_test_and_clear_dead(memcg)) 2775 if (memcg_kmem_test_and_clear_dead(memcg))
2769 css_put(&memcg->css); 2776 css_put(&memcg->css);
2777
2778 css_put_many(&memcg->css, nr_pages);
2770 } 2779 }
2771 2780
2772 /* 2781 /*
2773 * helper for accessing a memcg's index. It will be used as an index in the 2782 * helper for accessing a memcg's index. It will be used as an index in the
2774 * child cache array in kmem_cache, and also to derive its name. This function 2783 * child cache array in kmem_cache, and also to derive its name. This function
2775 * will return -1 when this is not a kmem-limited memcg. 2784 * will return -1 when this is not a kmem-limited memcg.
2776 */ 2785 */
2777 int memcg_cache_id(struct mem_cgroup *memcg) 2786 int memcg_cache_id(struct mem_cgroup *memcg)
2778 { 2787 {
2779 return memcg ? memcg->kmemcg_id : -1; 2788 return memcg ? memcg->kmemcg_id : -1;
2780 } 2789 }
2781 2790
2782 static int memcg_alloc_cache_id(void) 2791 static int memcg_alloc_cache_id(void)
2783 { 2792 {
2784 int id, size; 2793 int id, size;
2785 int err; 2794 int err;
2786 2795
2787 id = ida_simple_get(&kmem_limited_groups, 2796 id = ida_simple_get(&kmem_limited_groups,
2788 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2797 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2789 if (id < 0) 2798 if (id < 0)
2790 return id; 2799 return id;
2791 2800
2792 if (id < memcg_limited_groups_array_size) 2801 if (id < memcg_limited_groups_array_size)
2793 return id; 2802 return id;
2794 2803
2795 /* 2804 /*
2796 * There's no space for the new id in memcg_caches arrays, 2805 * There's no space for the new id in memcg_caches arrays,
2797 * so we have to grow them. 2806 * so we have to grow them.
2798 */ 2807 */
2799 2808
2800 size = 2 * (id + 1); 2809 size = 2 * (id + 1);
2801 if (size < MEMCG_CACHES_MIN_SIZE) 2810 if (size < MEMCG_CACHES_MIN_SIZE)
2802 size = MEMCG_CACHES_MIN_SIZE; 2811 size = MEMCG_CACHES_MIN_SIZE;
2803 else if (size > MEMCG_CACHES_MAX_SIZE) 2812 else if (size > MEMCG_CACHES_MAX_SIZE)
2804 size = MEMCG_CACHES_MAX_SIZE; 2813 size = MEMCG_CACHES_MAX_SIZE;
2805 2814
2806 mutex_lock(&memcg_slab_mutex); 2815 mutex_lock(&memcg_slab_mutex);
2807 err = memcg_update_all_caches(size); 2816 err = memcg_update_all_caches(size);
2808 mutex_unlock(&memcg_slab_mutex); 2817 mutex_unlock(&memcg_slab_mutex);
2809 2818
2810 if (err) { 2819 if (err) {
2811 ida_simple_remove(&kmem_limited_groups, id); 2820 ida_simple_remove(&kmem_limited_groups, id);
2812 return err; 2821 return err;
2813 } 2822 }
2814 return id; 2823 return id;
2815 } 2824 }
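The growth policy for the memcg_caches arrays is simply "double past the new id, then clamp to the allowed range": id 3, for example, grows the arrays to 8 entries. A small sketch with placeholder limits (the real MEMCG_CACHES_MIN_SIZE/MEMCG_CACHES_MAX_SIZE values are not reproduced here):

        #include <stdio.h>

        #define CACHES_MIN_SIZE 4               /* placeholder values */
        #define CACHES_MAX_SIZE 65536

        static int next_array_size(int id)
        {
                int size = 2 * (id + 1);        /* double past the new id */

                if (size < CACHES_MIN_SIZE)
                        size = CACHES_MIN_SIZE;
                else if (size > CACHES_MAX_SIZE)
                        size = CACHES_MAX_SIZE;
                return size;
        }

        int main(void)
        {
                /* prints: 4 8 65536 */
                printf("%d %d %d\n", next_array_size(0), next_array_size(3),
                       next_array_size(100000));
                return 0;
        }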
2816 2825
2817 static void memcg_free_cache_id(int id) 2826 static void memcg_free_cache_id(int id)
2818 { 2827 {
2819 ida_simple_remove(&kmem_limited_groups, id); 2828 ida_simple_remove(&kmem_limited_groups, id);
2820 } 2829 }
2821 2830
2822 /* 2831 /*
2823 * We should update the current array size iff all cache updates succeed. This 2832 * We should update the current array size iff all cache updates succeed. This
2824 * can only be done from the slab side. The slab mutex needs to be held when 2833 * can only be done from the slab side. The slab mutex needs to be held when
2825 * calling this. 2834 * calling this.
2826 */ 2835 */
2827 void memcg_update_array_size(int num) 2836 void memcg_update_array_size(int num)
2828 { 2837 {
2829 memcg_limited_groups_array_size = num; 2838 memcg_limited_groups_array_size = num;
2830 } 2839 }
2831 2840
2832 static void memcg_register_cache(struct mem_cgroup *memcg, 2841 static void memcg_register_cache(struct mem_cgroup *memcg,
2833 struct kmem_cache *root_cache) 2842 struct kmem_cache *root_cache)
2834 { 2843 {
2835 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 2844 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
2836 memcg_slab_mutex */ 2845 memcg_slab_mutex */
2837 struct kmem_cache *cachep; 2846 struct kmem_cache *cachep;
2838 int id; 2847 int id;
2839 2848
2840 lockdep_assert_held(&memcg_slab_mutex); 2849 lockdep_assert_held(&memcg_slab_mutex);
2841 2850
2842 id = memcg_cache_id(memcg); 2851 id = memcg_cache_id(memcg);
2843 2852
2844 /* 2853 /*
2845 * Since per-memcg caches are created asynchronously on first 2854 * Since per-memcg caches are created asynchronously on first
2846 * allocation (see memcg_kmem_get_cache()), several threads can try to 2855 * allocation (see memcg_kmem_get_cache()), several threads can try to
2847 * create the same cache, but only one of them may succeed. 2856 * create the same cache, but only one of them may succeed.
2848 */ 2857 */
2849 if (cache_from_memcg_idx(root_cache, id)) 2858 if (cache_from_memcg_idx(root_cache, id))
2850 return; 2859 return;
2851 2860
2852 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 2861 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2853 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 2862 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2854 /* 2863 /*
2855 * If we could not create a memcg cache, do not complain, because 2864 * If we could not create a memcg cache, do not complain, because
2856 * that's not critical at all as we can always proceed with the root 2865 * that's not critical at all as we can always proceed with the root
2857 * cache. 2866 * cache.
2858 */ 2867 */
2859 if (!cachep) 2868 if (!cachep)
2860 return; 2869 return;
2861 2870
2862 css_get(&memcg->css); 2871 css_get(&memcg->css);
2863 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2872 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2864 2873
2865 /* 2874 /*
2866 * Since readers won't lock (see cache_from_memcg_idx()), we need a 2875 * Since readers won't lock (see cache_from_memcg_idx()), we need a
2867 * barrier here to ensure nobody will see the kmem_cache partially 2876 * barrier here to ensure nobody will see the kmem_cache partially
2868 * initialized. 2877 * initialized.
2869 */ 2878 */
2870 smp_wmb(); 2879 smp_wmb();
2871 2880
2872 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 2881 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2873 root_cache->memcg_params->memcg_caches[id] = cachep; 2882 root_cache->memcg_params->memcg_caches[id] = cachep;
2874 } 2883 }
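The smp_wmb() above is the usual initialize-then-publish pattern for lockless readers: the cache is fully set up first and only then made visible through the memcg_caches array. A simplified user-space analogue using C11 release/acquire ordering (hypothetical names; the kernel instead pairs the write barrier with the ordering on the reader side in cache_from_memcg_idx()):

        #include <stdatomic.h>
        #include <stddef.h>

        struct cache {
                int object_size;
        };

        static struct cache slot_storage;
        static _Atomic(struct cache *) slot;    /* what lockless readers see */

        static void publish(int object_size)
        {
                slot_storage.object_size = object_size;         /* init first */
                atomic_store_explicit(&slot, &slot_storage,
                                      memory_order_release);    /* then publish */
        }

        static struct cache *lookup(void)
        {
                return atomic_load_explicit(&slot, memory_order_acquire);
        }

        int main(void)
        {
                publish(128);
                struct cache *c = lookup();

                return (c && c->object_size == 128) ? 0 : 1;
        }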
2875 2884
2876 static void memcg_unregister_cache(struct kmem_cache *cachep) 2885 static void memcg_unregister_cache(struct kmem_cache *cachep)
2877 { 2886 {
2878 struct kmem_cache *root_cache; 2887 struct kmem_cache *root_cache;
2879 struct mem_cgroup *memcg; 2888 struct mem_cgroup *memcg;
2880 int id; 2889 int id;
2881 2890
2882 lockdep_assert_held(&memcg_slab_mutex); 2891 lockdep_assert_held(&memcg_slab_mutex);
2883 2892
2884 BUG_ON(is_root_cache(cachep)); 2893 BUG_ON(is_root_cache(cachep));
2885 2894
2886 root_cache = cachep->memcg_params->root_cache; 2895 root_cache = cachep->memcg_params->root_cache;
2887 memcg = cachep->memcg_params->memcg; 2896 memcg = cachep->memcg_params->memcg;
2888 id = memcg_cache_id(memcg); 2897 id = memcg_cache_id(memcg);
2889 2898
2890 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 2899 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2891 root_cache->memcg_params->memcg_caches[id] = NULL; 2900 root_cache->memcg_params->memcg_caches[id] = NULL;
2892 2901
2893 list_del(&cachep->memcg_params->list); 2902 list_del(&cachep->memcg_params->list);
2894 2903
2895 kmem_cache_destroy(cachep); 2904 kmem_cache_destroy(cachep);
2896 2905
2897 /* drop the reference taken in memcg_register_cache */ 2906 /* drop the reference taken in memcg_register_cache */
2898 css_put(&memcg->css); 2907 css_put(&memcg->css);
2899 } 2908 }
2900 2909
2901 /* 2910 /*
2902 * During the creation of a new cache, we need to disable our accounting mechanism 2911 * During the creation of a new cache, we need to disable our accounting mechanism
2903 * altogether. This is true even if we are not creating, but rather just 2912 * altogether. This is true even if we are not creating, but rather just
2904 * enqueueing new caches to be created. 2913 * enqueueing new caches to be created.
2905 * 2914 *
2906 * This is because that process will trigger allocations; some visible, like 2915 * This is because that process will trigger allocations; some visible, like
2907 * explicit kmallocs to auxiliary data structures, name strings and internal 2916 * explicit kmallocs to auxiliary data structures, name strings and internal
2908 * cache structures; some well concealed, like INIT_WORK() that can allocate 2917 * cache structures; some well concealed, like INIT_WORK() that can allocate
2909 * objects during debug. 2918 * objects during debug.
2910 * 2919 *
2911 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 2920 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
2912 * to it. This may not be a bounded recursion: since the first cache creation 2921 * to it. This may not be a bounded recursion: since the first cache creation
2913 * failed to complete (waiting on the allocation), we'll just try to create the 2922 * failed to complete (waiting on the allocation), we'll just try to create the
2914 * cache again, failing at the same point. 2923 * cache again, failing at the same point.
2915 * 2924 *
2916 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 2925 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
2917 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 2926 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
2918 * inside the following two functions. 2927 * inside the following two functions.
2919 */ 2928 */
2920 static inline void memcg_stop_kmem_account(void) 2929 static inline void memcg_stop_kmem_account(void)
2921 { 2930 {
2922 VM_BUG_ON(!current->mm); 2931 VM_BUG_ON(!current->mm);
2923 current->memcg_kmem_skip_account++; 2932 current->memcg_kmem_skip_account++;
2924 } 2933 }
2925 2934
2926 static inline void memcg_resume_kmem_account(void) 2935 static inline void memcg_resume_kmem_account(void)
2927 { 2936 {
2928 VM_BUG_ON(!current->mm); 2937 VM_BUG_ON(!current->mm);
2929 current->memcg_kmem_skip_account--; 2938 current->memcg_kmem_skip_account--;
2930 } 2939 }
2931 2940
2932 int __memcg_cleanup_cache_params(struct kmem_cache *s) 2941 int __memcg_cleanup_cache_params(struct kmem_cache *s)
2933 { 2942 {
2934 struct kmem_cache *c; 2943 struct kmem_cache *c;
2935 int i, failed = 0; 2944 int i, failed = 0;
2936 2945
2937 mutex_lock(&memcg_slab_mutex); 2946 mutex_lock(&memcg_slab_mutex);
2938 for_each_memcg_cache_index(i) { 2947 for_each_memcg_cache_index(i) {
2939 c = cache_from_memcg_idx(s, i); 2948 c = cache_from_memcg_idx(s, i);
2940 if (!c) 2949 if (!c)
2941 continue; 2950 continue;
2942 2951
2943 memcg_unregister_cache(c); 2952 memcg_unregister_cache(c);
2944 2953
2945 if (cache_from_memcg_idx(s, i)) 2954 if (cache_from_memcg_idx(s, i))
2946 failed++; 2955 failed++;
2947 } 2956 }
2948 mutex_unlock(&memcg_slab_mutex); 2957 mutex_unlock(&memcg_slab_mutex);
2949 return failed; 2958 return failed;
2950 } 2959 }
2951 2960
2952 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2961 static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2953 { 2962 {
2954 struct kmem_cache *cachep; 2963 struct kmem_cache *cachep;
2955 struct memcg_cache_params *params, *tmp; 2964 struct memcg_cache_params *params, *tmp;
2956 2965
2957 if (!memcg_kmem_is_active(memcg)) 2966 if (!memcg_kmem_is_active(memcg))
2958 return; 2967 return;
2959 2968
2960 mutex_lock(&memcg_slab_mutex); 2969 mutex_lock(&memcg_slab_mutex);
2961 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 2970 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2962 cachep = memcg_params_to_cache(params); 2971 cachep = memcg_params_to_cache(params);
2963 kmem_cache_shrink(cachep); 2972 kmem_cache_shrink(cachep);
2964 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 2973 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
2965 memcg_unregister_cache(cachep); 2974 memcg_unregister_cache(cachep);
2966 } 2975 }
2967 mutex_unlock(&memcg_slab_mutex); 2976 mutex_unlock(&memcg_slab_mutex);
2968 } 2977 }
2969 2978
2970 struct memcg_register_cache_work { 2979 struct memcg_register_cache_work {
2971 struct mem_cgroup *memcg; 2980 struct mem_cgroup *memcg;
2972 struct kmem_cache *cachep; 2981 struct kmem_cache *cachep;
2973 struct work_struct work; 2982 struct work_struct work;
2974 }; 2983 };
2975 2984
2976 static void memcg_register_cache_func(struct work_struct *w) 2985 static void memcg_register_cache_func(struct work_struct *w)
2977 { 2986 {
2978 struct memcg_register_cache_work *cw = 2987 struct memcg_register_cache_work *cw =
2979 container_of(w, struct memcg_register_cache_work, work); 2988 container_of(w, struct memcg_register_cache_work, work);
2980 struct mem_cgroup *memcg = cw->memcg; 2989 struct mem_cgroup *memcg = cw->memcg;
2981 struct kmem_cache *cachep = cw->cachep; 2990 struct kmem_cache *cachep = cw->cachep;
2982 2991
2983 mutex_lock(&memcg_slab_mutex); 2992 mutex_lock(&memcg_slab_mutex);
2984 memcg_register_cache(memcg, cachep); 2993 memcg_register_cache(memcg, cachep);
2985 mutex_unlock(&memcg_slab_mutex); 2994 mutex_unlock(&memcg_slab_mutex);
2986 2995
2987 css_put(&memcg->css); 2996 css_put(&memcg->css);
2988 kfree(cw); 2997 kfree(cw);
2989 } 2998 }
2990 2999
2991 /* 3000 /*
2992 * Enqueue the creation of a per-memcg kmem_cache. 3001 * Enqueue the creation of a per-memcg kmem_cache.
2993 */ 3002 */
2994 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 3003 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2995 struct kmem_cache *cachep) 3004 struct kmem_cache *cachep)
2996 { 3005 {
2997 struct memcg_register_cache_work *cw; 3006 struct memcg_register_cache_work *cw;
2998 3007
2999 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 3008 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3000 if (cw == NULL) { 3009 if (cw == NULL) {
3001 css_put(&memcg->css); 3010 css_put(&memcg->css);
3002 return; 3011 return;
3003 } 3012 }
3004 3013
3005 cw->memcg = memcg; 3014 cw->memcg = memcg;
3006 cw->cachep = cachep; 3015 cw->cachep = cachep;
3007 3016
3008 INIT_WORK(&cw->work, memcg_register_cache_func); 3017 INIT_WORK(&cw->work, memcg_register_cache_func);
3009 schedule_work(&cw->work); 3018 schedule_work(&cw->work);
3010 } 3019 }
3011 3020
3012 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 3021 static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3013 struct kmem_cache *cachep) 3022 struct kmem_cache *cachep)
3014 { 3023 {
3015 /* 3024 /*
3016 * We need to stop accounting when we kmalloc, because if the 3025 * We need to stop accounting when we kmalloc, because if the
3017 * corresponding kmalloc cache is not yet created, the first allocation 3026 * corresponding kmalloc cache is not yet created, the first allocation
3018 * in __memcg_schedule_register_cache will recurse. 3027 * in __memcg_schedule_register_cache will recurse.
3019 * 3028 *
3020 * However, it is better to enclose the whole function. Depending on 3029 * However, it is better to enclose the whole function. Depending on
3021 * the debugging options enabled, INIT_WORK(), for instance, can 3030 * the debugging options enabled, INIT_WORK(), for instance, can
3022 * trigger an allocation. This, too, will make us recurse. Because at 3031 * trigger an allocation. This, too, will make us recurse. Because at
3023 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3032 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3024 * the safest choice is to do it like this, wrapping the whole function. 3033 * the safest choice is to do it like this, wrapping the whole function.
3025 */ 3034 */
3026 memcg_stop_kmem_account(); 3035 memcg_stop_kmem_account();
3027 __memcg_schedule_register_cache(memcg, cachep); 3036 __memcg_schedule_register_cache(memcg, cachep);
3028 memcg_resume_kmem_account(); 3037 memcg_resume_kmem_account();
3029 } 3038 }
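memcg_stop_kmem_account()/memcg_resume_kmem_account() act as a per-task recursion guard: while the counter is raised, the accounting hook becomes a no-op, so the internal allocations made during cache creation cannot recurse back into memcg_kmem_get_cache(). A user-space sketch of that guard (hypothetical names, thread-local state instead of a per-task field):

        #include <stdbool.h>
        #include <stdio.h>

        static _Thread_local int skip_account;

        static void stop_account(void)   { skip_account++; }
        static void resume_account(void) { skip_account--; }

        static bool account_alloc(size_t size)
        {
                if (skip_account)
                        return false;   /* internal allocation: don't account */
                printf("accounted %zu bytes\n", size);
                return true;
        }

        static void schedule_internal_work(void)
        {
                stop_account();
                account_alloc(64);      /* would otherwise recurse back into us */
                resume_account();
        }

        int main(void)
        {
                schedule_internal_work();       /* nothing gets accounted */
                account_alloc(4096);            /* accounted normally */
                return 0;
        }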
3030 3039
3031 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3040 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3032 { 3041 {
3033 unsigned int nr_pages = 1 << order; 3042 unsigned int nr_pages = 1 << order;
3034 int res; 3043 int res;
3035 3044
3036 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 3045 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3037 if (!res) 3046 if (!res)
3038 atomic_add(nr_pages, &cachep->memcg_params->nr_pages); 3047 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3039 return res; 3048 return res;
3040 } 3049 }
3041 3050
3042 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3051 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3043 { 3052 {
3044 unsigned int nr_pages = 1 << order; 3053 unsigned int nr_pages = 1 << order;
3045 3054
3046 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 3055 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
3047 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); 3056 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3048 } 3057 }
3049 3058
3050 /* 3059 /*
3051 * Return the kmem_cache we're supposed to use for a slab allocation. 3060 * Return the kmem_cache we're supposed to use for a slab allocation.
3052 * We try to use the current memcg's version of the cache. 3061 * We try to use the current memcg's version of the cache.
3053 * 3062 *
3054 * If the cache does not exist yet (i.e. we are the first user of it), 3063 * If the cache does not exist yet (i.e. we are the first user of it),
3055 * we either create it immediately, if possible, or create it asynchronously 3064 * we either create it immediately, if possible, or create it asynchronously
3056 * in a workqueue. 3065 * in a workqueue.
3057 * In the latter case, we will let the current allocation go through with 3066 * In the latter case, we will let the current allocation go through with
3058 * the original cache. 3067 * the original cache.
3059 * 3068 *
3060 * Can't be called in interrupt context or from kernel threads. 3069 * Can't be called in interrupt context or from kernel threads.
3061 * This function needs to be called with rcu_read_lock() held. 3070 * This function needs to be called with rcu_read_lock() held.
3062 */ 3071 */
3063 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3072 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3064 gfp_t gfp) 3073 gfp_t gfp)
3065 { 3074 {
3066 struct mem_cgroup *memcg; 3075 struct mem_cgroup *memcg;
3067 struct kmem_cache *memcg_cachep; 3076 struct kmem_cache *memcg_cachep;
3068 3077
3069 VM_BUG_ON(!cachep->memcg_params); 3078 VM_BUG_ON(!cachep->memcg_params);
3070 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3079 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3071 3080
3072 if (!current->mm || current->memcg_kmem_skip_account) 3081 if (!current->mm || current->memcg_kmem_skip_account)
3073 return cachep; 3082 return cachep;
3074 3083
3075 rcu_read_lock(); 3084 rcu_read_lock();
3076 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3085 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3077 3086
3078 if (!memcg_kmem_is_active(memcg)) 3087 if (!memcg_kmem_is_active(memcg))
3079 goto out; 3088 goto out;
3080 3089
3081 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3090 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3082 if (likely(memcg_cachep)) { 3091 if (likely(memcg_cachep)) {
3083 cachep = memcg_cachep; 3092 cachep = memcg_cachep;
3084 goto out; 3093 goto out;
3085 } 3094 }
3086 3095
3087 /* The corresponding put will be done in the workqueue. */ 3096 /* The corresponding put will be done in the workqueue. */
3088 if (!css_tryget_online(&memcg->css)) 3097 if (!css_tryget_online(&memcg->css))
3089 goto out; 3098 goto out;
3090 rcu_read_unlock(); 3099 rcu_read_unlock();
3091 3100
3092 /* 3101 /*
3093 * If we are in a safe context (can wait, and not in interrupt 3102 * If we are in a safe context (can wait, and not in interrupt
3094 * context), we could be predictable and return right away. 3103 * context), we could be predictable and return right away.
3095 * This would guarantee that the allocation being performed 3104 * This would guarantee that the allocation being performed
3096 * already belongs in the new cache. 3105 * already belongs in the new cache.
3097 * 3106 *
3098 * However, there are some clashes that can arise from locking. 3107 * However, there are some clashes that can arise from locking.
3099 * For instance, because we acquire the slab_mutex while doing 3108 * For instance, because we acquire the slab_mutex while doing
3100 * memcg_create_kmem_cache, this means no further allocation 3109 * memcg_create_kmem_cache, this means no further allocation
3101 * could happen with the slab_mutex held. So it's better to 3110 * could happen with the slab_mutex held. So it's better to
3102 * defer everything. 3111 * defer everything.
3103 */ 3112 */
3104 memcg_schedule_register_cache(memcg, cachep); 3113 memcg_schedule_register_cache(memcg, cachep);
3105 return cachep; 3114 return cachep;
3106 out: 3115 out:
3107 rcu_read_unlock(); 3116 rcu_read_unlock();
3108 return cachep; 3117 return cachep;
3109 } 3118 }
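So the selection logic boils down to: use the per-memcg copy when it already exists, otherwise kick off its creation and let the current allocation proceed with the root cache. Roughly, as a user-space sketch (hypothetical names; no RCU, refcounting or per-task checks shown):

        #include <stdbool.h>
        #include <stdio.h>

        struct cache { const char *name; };

        static struct cache root_cache = { "root" };
        static struct cache *per_group_copy;    /* created asynchronously */

        static void schedule_create(void)
        {
                printf("scheduling per-group cache creation\n");
        }

        static struct cache *pick_cache(bool accounting_active)
        {
                if (!accounting_active)
                        return &root_cache;
                if (per_group_copy)
                        return per_group_copy;  /* fast path once it exists */
                schedule_create();
                return &root_cache;             /* fall back for this allocation */
        }

        int main(void)
        {
                struct cache *c = pick_cache(true);     /* schedules, falls back */

                printf("using %s cache\n", c->name);
                return 0;
        }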
3110 3119
3111 /* 3120 /*
3112 * We need to verify if the allocation against current->mm->owner's memcg is 3121 * We need to verify if the allocation against current->mm->owner's memcg is
3113 * possible for the given order. But the page is not allocated yet, so we'll 3122 * possible for the given order. But the page is not allocated yet, so we'll
3114 * need a further commit step to do the final arrangements. 3123 * need a further commit step to do the final arrangements.
3115 * 3124 *
3116 * It is possible for the task to switch cgroups in the meantime, so at 3125 * It is possible for the task to switch cgroups in the meantime, so at
3117 * commit time, we can't rely on task conversion any longer. We'll then use 3126 * commit time, we can't rely on task conversion any longer. We'll then use
3118 * the handle argument to return to the caller which cgroup we should commit 3127 * the handle argument to return to the caller which cgroup we should commit
3119 * against. We could also return the memcg directly and avoid the pointer 3128 * against. We could also return the memcg directly and avoid the pointer
3120 * passing, but a boolean return value gives better semantics considering 3129 * passing, but a boolean return value gives better semantics considering
3121 * the compiled-out case as well. 3130 * the compiled-out case as well.
3122 * 3131 *
3123 * Returning true means the allocation is possible. 3132 * Returning true means the allocation is possible.
3124 */ 3133 */
3125 bool 3134 bool
3126 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3135 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3127 { 3136 {
3128 struct mem_cgroup *memcg; 3137 struct mem_cgroup *memcg;
3129 int ret; 3138 int ret;
3130 3139
3131 *_memcg = NULL; 3140 *_memcg = NULL;
3132 3141
3133 /* 3142 /*
3134 * Disabling accounting is only relevant for some specific memcg 3143 * Disabling accounting is only relevant for some specific memcg
3135 * internal allocations. Therefore we would initially not have such a 3144 * internal allocations. Therefore we would initially not have such a
3136 * check here, since direct calls to the page allocator that are 3145 * check here, since direct calls to the page allocator that are
3137 * accounted to kmemcg (alloc_kmem_pages and friends) only happen 3146 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
3138 * outside memcg core. We are mostly concerned with cache allocations, 3147 * outside memcg core. We are mostly concerned with cache allocations,
3139 * and by having this test at memcg_kmem_get_cache, we are already able 3148 * and by having this test at memcg_kmem_get_cache, we are already able
3140 * to relay the allocation to the root cache and bypass the memcg cache 3149 * to relay the allocation to the root cache and bypass the memcg cache
3141 * altogether. 3150 * altogether.
3142 * 3151 *
3143 * There is one exception, though: the SLUB allocator does not create 3152 * There is one exception, though: the SLUB allocator does not create
3144 * large order caches, but rather services large kmallocs directly from 3153 * large order caches, but rather services large kmallocs directly from
3145 * the page allocator. Therefore, the following sequence when backed by 3154 * the page allocator. Therefore, the following sequence when backed by
3146 * the SLUB allocator: 3155 * the SLUB allocator:
3147 * 3156 *
3148 * memcg_stop_kmem_account(); 3157 * memcg_stop_kmem_account();
3149 * kmalloc(<large_number>) 3158 * kmalloc(<large_number>)
3150 * memcg_resume_kmem_account(); 3159 * memcg_resume_kmem_account();
3151 * 3160 *
3152 * would effectively ignore the fact that we should skip accounting, 3161 * would effectively ignore the fact that we should skip accounting,
3153 * since it will drive us directly to this function without passing 3162 * since it will drive us directly to this function without passing
3154 * through the cache selector memcg_kmem_get_cache. Such large 3163 * through the cache selector memcg_kmem_get_cache. Such large
3155 * allocations are extremely rare but can happen, for instance, for the 3164 * allocations are extremely rare but can happen, for instance, for the
3156 * cache arrays. We bring this test here. 3165 * cache arrays. We bring this test here.
3157 */ 3166 */
3158 if (!current->mm || current->memcg_kmem_skip_account) 3167 if (!current->mm || current->memcg_kmem_skip_account)
3159 return true; 3168 return true;
3160 3169
3161 memcg = get_mem_cgroup_from_mm(current->mm); 3170 memcg = get_mem_cgroup_from_mm(current->mm);
3162 3171
3163 if (!memcg_kmem_is_active(memcg)) { 3172 if (!memcg_kmem_is_active(memcg)) {
3164 css_put(&memcg->css); 3173 css_put(&memcg->css);
3165 return true; 3174 return true;
3166 } 3175 }
3167 3176
3168 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 3177 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3169 if (!ret) 3178 if (!ret)
3170 *_memcg = memcg; 3179 *_memcg = memcg;
3171 3180
3172 css_put(&memcg->css); 3181 css_put(&memcg->css);
3173 return (ret == 0); 3182 return (ret == 0);
3174 } 3183 }
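The charge is thus split into a reservation made before the page exists and a commit (or revert) once the allocation outcome is known. A bare-bones sketch of that protocol (user-space analogue, hypothetical names; malloc() merely stands in for the page allocator):

        #include <stdbool.h>
        #include <stdlib.h>

        struct group { long charged; };

        static bool newpage_charge(struct group *g, unsigned int nr_pages)
        {
                g->charged += nr_pages;         /* reserve before allocating */
                return true;                    /* "allocation is possible" */
        }

        static void commit_or_revert(struct group *g, void *page,
                                     unsigned int nr_pages)
        {
                if (!page)
                        g->charged -= nr_pages; /* allocation failed: revert */
                /* else: record g as the owner of @page */
        }

        int main(void)
        {
                struct group g = { 0 };
                unsigned int nr = 1;            /* an order-0 allocation */

                if (newpage_charge(&g, nr)) {
                        void *page = malloc(4096);      /* alloc_pages() stand-in */

                        commit_or_revert(&g, page, nr);
                        free(page);
                }
                return 0;
        }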
3175 3184
3176 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3185 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3177 int order) 3186 int order)
3178 { 3187 {
3179 struct page_cgroup *pc; 3188 struct page_cgroup *pc;
3180 3189
3181 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3190 VM_BUG_ON(mem_cgroup_is_root(memcg));
3182 3191
3183 /* The page allocation failed. Revert */ 3192 /* The page allocation failed. Revert */
3184 if (!page) { 3193 if (!page) {
3185 memcg_uncharge_kmem(memcg, 1 << order); 3194 memcg_uncharge_kmem(memcg, 1 << order);
3186 return; 3195 return;
3187 } 3196 }
3188 /* 3197 /*
3189 * The page is freshly allocated and not visible to any 3198 * The page is freshly allocated and not visible to any
3190 * outside callers yet. Set up pc non-atomically. 3199 * outside callers yet. Set up pc non-atomically.
3191 */ 3200 */
3192 pc = lookup_page_cgroup(page); 3201 pc = lookup_page_cgroup(page);
3193 pc->mem_cgroup = memcg; 3202 pc->mem_cgroup = memcg;
3194 pc->flags = PCG_USED; 3203 pc->flags = PCG_USED;
3195 } 3204 }
3196 3205
3197 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3206 void __memcg_kmem_uncharge_pages(struct page *page, int order)
3198 { 3207 {
3199 struct mem_cgroup *memcg = NULL; 3208 struct mem_cgroup *memcg = NULL;
3200 struct page_cgroup *pc; 3209 struct page_cgroup *pc;
3201 3210
3202 3211
3203 pc = lookup_page_cgroup(page); 3212 pc = lookup_page_cgroup(page);
3204 if (!PageCgroupUsed(pc)) 3213 if (!PageCgroupUsed(pc))
3205 return; 3214 return;
3206 3215
3207 memcg = pc->mem_cgroup; 3216 memcg = pc->mem_cgroup;
3208 pc->flags = 0; 3217 pc->flags = 0;
3209 3218
3210 /* 3219 /*
3211 * We trust that the allocation is valid only if there is a memcg 3220 * We trust that the allocation is valid only if there is a memcg
3212 * associated with the page 3221 * associated with the page
3213 */ 3222 */
3214 if (!memcg) 3223 if (!memcg)
3215 return; 3224 return;
3216 3225
3217 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3226 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3218 memcg_uncharge_kmem(memcg, 1 << order); 3227 memcg_uncharge_kmem(memcg, 1 << order);
3219 } 3228 }
3220 #else 3229 #else
3221 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3230 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3222 { 3231 {
3223 } 3232 }
3224 #endif /* CONFIG_MEMCG_KMEM */ 3233 #endif /* CONFIG_MEMCG_KMEM */
3225 3234
3226 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3235 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3227 3236
3228 /* 3237 /*
3229 * Because tail pages are not marked as "used", mark them here. We're under 3238 * Because tail pages are not marked as "used", mark them here. We're under
3230 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3239 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3231 * charge/uncharge will never happen and move_account() is done under 3240 * charge/uncharge will never happen and move_account() is done under
3232 * compound_lock(), so we don't have to take care of races. 3241 * compound_lock(), so we don't have to take care of races.
3233 */ 3242 */
3234 void mem_cgroup_split_huge_fixup(struct page *head) 3243 void mem_cgroup_split_huge_fixup(struct page *head)
3235 { 3244 {
3236 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3245 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3237 struct page_cgroup *pc; 3246 struct page_cgroup *pc;
3238 struct mem_cgroup *memcg; 3247 struct mem_cgroup *memcg;
3239 int i; 3248 int i;
3240 3249
3241 if (mem_cgroup_disabled()) 3250 if (mem_cgroup_disabled())
3242 return; 3251 return;
3243 3252
3244 memcg = head_pc->mem_cgroup; 3253 memcg = head_pc->mem_cgroup;
3245 for (i = 1; i < HPAGE_PMD_NR; i++) { 3254 for (i = 1; i < HPAGE_PMD_NR; i++) {
3246 pc = head_pc + i; 3255 pc = head_pc + i;
3247 pc->mem_cgroup = memcg; 3256 pc->mem_cgroup = memcg;
3248 pc->flags = head_pc->flags; 3257 pc->flags = head_pc->flags;
3249 } 3258 }
3250 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3259 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3251 HPAGE_PMD_NR); 3260 HPAGE_PMD_NR);
3252 } 3261 }
3253 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3262 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3254 3263
3255 /** 3264 /**
3256 * mem_cgroup_move_account - move account of the page 3265 * mem_cgroup_move_account - move account of the page
3257 * @page: the page 3266 * @page: the page
3258 * @nr_pages: number of regular pages (>1 for huge pages) 3267 * @nr_pages: number of regular pages (>1 for huge pages)
3259 * @pc: page_cgroup of the page. 3268 * @pc: page_cgroup of the page.
3260 * @from: mem_cgroup which the page is moved from. 3269 * @from: mem_cgroup which the page is moved from.
3261 * @to: mem_cgroup which the page is moved to. @from != @to. 3270 * @to: mem_cgroup which the page is moved to. @from != @to.
3262 * 3271 *
3263 * The caller must confirm the following. 3272 * The caller must confirm the following.
3264 * - page is not on LRU (isolate_page() is useful.) 3273 * - page is not on LRU (isolate_page() is useful.)
3265 * - compound_lock is held when nr_pages > 1 3274 * - compound_lock is held when nr_pages > 1
3266 * 3275 *
3267 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3276 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3268 * from old cgroup. 3277 * from old cgroup.
3269 */ 3278 */
3270 static int mem_cgroup_move_account(struct page *page, 3279 static int mem_cgroup_move_account(struct page *page,
3271 unsigned int nr_pages, 3280 unsigned int nr_pages,
3272 struct page_cgroup *pc, 3281 struct page_cgroup *pc,
3273 struct mem_cgroup *from, 3282 struct mem_cgroup *from,
3274 struct mem_cgroup *to) 3283 struct mem_cgroup *to)
3275 { 3284 {
3276 unsigned long flags; 3285 unsigned long flags;
3277 int ret; 3286 int ret;
3278 3287
3279 VM_BUG_ON(from == to); 3288 VM_BUG_ON(from == to);
3280 VM_BUG_ON_PAGE(PageLRU(page), page); 3289 VM_BUG_ON_PAGE(PageLRU(page), page);
3281 /* 3290 /*
3282 * The page is isolated from LRU. So, collapse function 3291 * The page is isolated from LRU. So, collapse function
3283 * will not handle this page. But page splitting can happen. 3292 * will not handle this page. But page splitting can happen.
3284 * Do this check under compound_page_lock(). The caller should 3293 * Do this check under compound_page_lock(). The caller should
3285 * hold it. 3294 * hold it.
3286 */ 3295 */
3287 ret = -EBUSY; 3296 ret = -EBUSY;
3288 if (nr_pages > 1 && !PageTransHuge(page)) 3297 if (nr_pages > 1 && !PageTransHuge(page))
3289 goto out; 3298 goto out;
3290 3299
3291 /* 3300 /*
3292 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3301 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
3293 * of its source page while we change it: page migration takes 3302 * of its source page while we change it: page migration takes
3294 * both pages off the LRU, but page cache replacement doesn't. 3303 * both pages off the LRU, but page cache replacement doesn't.
3295 */ 3304 */
3296 if (!trylock_page(page)) 3305 if (!trylock_page(page))
3297 goto out; 3306 goto out;
3298 3307
3299 ret = -EINVAL; 3308 ret = -EINVAL;
3300 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3309 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3301 goto out_unlock; 3310 goto out_unlock;
3302 3311
3303 move_lock_mem_cgroup(from, &flags); 3312 move_lock_mem_cgroup(from, &flags);
3304 3313
3305 if (!PageAnon(page) && page_mapped(page)) { 3314 if (!PageAnon(page) && page_mapped(page)) {
3306 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3315 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3307 nr_pages); 3316 nr_pages);
3308 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3317 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3309 nr_pages); 3318 nr_pages);
3310 } 3319 }
3311 3320
3312 if (PageWriteback(page)) { 3321 if (PageWriteback(page)) {
3313 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3322 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3314 nr_pages); 3323 nr_pages);
3315 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3324 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3316 nr_pages); 3325 nr_pages);
3317 } 3326 }
3318 3327
3319 /* 3328 /*
3320 * It is safe to change pc->mem_cgroup here because the page 3329 * It is safe to change pc->mem_cgroup here because the page
3321 * is referenced, charged, and isolated - we can't race with 3330 * is referenced, charged, and isolated - we can't race with
3322 * uncharging, charging, migration, or LRU putback. 3331 * uncharging, charging, migration, or LRU putback.
3323 */ 3332 */
3324 3333
3325 /* caller should have done css_get */ 3334 /* caller should have done css_get */
3326 pc->mem_cgroup = to; 3335 pc->mem_cgroup = to;
3327 move_unlock_mem_cgroup(from, &flags); 3336 move_unlock_mem_cgroup(from, &flags);
3328 ret = 0; 3337 ret = 0;
3329 3338
3330 local_irq_disable(); 3339 local_irq_disable();
3331 mem_cgroup_charge_statistics(to, page, nr_pages); 3340 mem_cgroup_charge_statistics(to, page, nr_pages);
3332 memcg_check_events(to, page); 3341 memcg_check_events(to, page);
3333 mem_cgroup_charge_statistics(from, page, -nr_pages); 3342 mem_cgroup_charge_statistics(from, page, -nr_pages);
3334 memcg_check_events(from, page); 3343 memcg_check_events(from, page);
3335 local_irq_enable(); 3344 local_irq_enable();
3336 out_unlock: 3345 out_unlock:
3337 unlock_page(page); 3346 unlock_page(page);
3338 out: 3347 out:
3339 return ret; 3348 return ret;
3340 } 3349 }
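The core of mem_cgroup_move_account() is: verify the page still belongs to @from under the move lock, transfer the per-group statistics, then flip the owner pointer. A minimal userspace sketch of that check-and-transfer idiom is below; the account/item structs, the single "mapped" counter, and the -EINVAL convention are inventions for illustration (the trylock_page()/-EBUSY step and the per-cpu statistics are omitted), not the kernel's API.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct account {
	pthread_mutex_t move_lock;   /* stands in for move_lock_mem_cgroup() */
	long mapped;                 /* stands in for one per-group statistic */
};

struct item {
	struct account *owner;       /* stands in for pc->mem_cgroup */
	int mapped;                  /* non-zero if the item contributes to 'mapped' */
};

/* Move one item from 'from' to 'to'; fails if the item changed owner meanwhile. */
static int move_account(struct item *it, struct account *from, struct account *to)
{
	int ret = -EINVAL;

	pthread_mutex_lock(&from->move_lock);
	if (it->owner != from)
		goto out;              /* lost a race: someone else moved/uncharged it */

	if (it->mapped) {              /* the statistic travels with the item */
		from->mapped--;
		to->mapped++;
	}
	it->owner = to;                /* the actual "move": flip the owner pointer */
	ret = 0;
out:
	pthread_mutex_unlock(&from->move_lock);
	return ret;
}

int main(void)
{
	struct account a = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct account b = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct item it = { &a, 1 };
	int ret = move_account(&it, &a, &b);

	printf("move: %d, a.mapped=%ld, b.mapped=%ld\n", ret, a.mapped, b.mapped);
	return 0;
}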
3341 3350
3342 /** 3351 /**
3343 * mem_cgroup_move_parent - moves page to the parent group 3352 * mem_cgroup_move_parent - moves page to the parent group
3344 * @page: the page to move 3353 * @page: the page to move
3345 * @pc: page_cgroup of the page 3354 * @pc: page_cgroup of the page
3346 * @child: page's cgroup 3355 * @child: page's cgroup
3347 * 3356 *
3348 * move charges to its parent or the root cgroup if the group has no 3357 * move charges to its parent or the root cgroup if the group has no
3349 * parent (aka use_hierarchy==0). 3358 * parent (aka use_hierarchy==0).
3350 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3359 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3351 * mem_cgroup_move_account fails) the failure is always temporary and 3360 * mem_cgroup_move_account fails) the failure is always temporary and
3352 * it signals a race with a page removal/uncharge or migration. In the 3361 * it signals a race with a page removal/uncharge or migration. In the
3353 * first case the page is on the way out and it will vanish from the LRU 3362 * first case the page is on the way out and it will vanish from the LRU
3354 * on the next attempt and the call should be retried later. 3363 * on the next attempt and the call should be retried later.
3355 * Isolation from the LRU fails only if page has been isolated from 3364 * Isolation from the LRU fails only if page has been isolated from
3356 * the LRU since we looked at it and that usually means either global 3365 * the LRU since we looked at it and that usually means either global
3357 * reclaim or migration going on. The page will either get back to the 3366 * reclaim or migration going on. The page will either get back to the
3358 * LRU or vanish. 3367 * LRU or vanish.
3359 * Finally mem_cgroup_move_account fails only if the page got uncharged 3368 * Finally mem_cgroup_move_account fails only if the page got uncharged
3360 * (!PageCgroupUsed) or moved to a different group. The page will 3369 * (!PageCgroupUsed) or moved to a different group. The page will
3361 * disappear in the next attempt. 3370 * disappear in the next attempt.
3362 */ 3371 */
3363 static int mem_cgroup_move_parent(struct page *page, 3372 static int mem_cgroup_move_parent(struct page *page,
3364 struct page_cgroup *pc, 3373 struct page_cgroup *pc,
3365 struct mem_cgroup *child) 3374 struct mem_cgroup *child)
3366 { 3375 {
3367 struct mem_cgroup *parent; 3376 struct mem_cgroup *parent;
3368 unsigned int nr_pages; 3377 unsigned int nr_pages;
3369 unsigned long uninitialized_var(flags); 3378 unsigned long uninitialized_var(flags);
3370 int ret; 3379 int ret;
3371 3380
3372 VM_BUG_ON(mem_cgroup_is_root(child)); 3381 VM_BUG_ON(mem_cgroup_is_root(child));
3373 3382
3374 ret = -EBUSY; 3383 ret = -EBUSY;
3375 if (!get_page_unless_zero(page)) 3384 if (!get_page_unless_zero(page))
3376 goto out; 3385 goto out;
3377 if (isolate_lru_page(page)) 3386 if (isolate_lru_page(page))
3378 goto put; 3387 goto put;
3379 3388
3380 nr_pages = hpage_nr_pages(page); 3389 nr_pages = hpage_nr_pages(page);
3381 3390
3382 parent = parent_mem_cgroup(child); 3391 parent = parent_mem_cgroup(child);
3383 /* 3392 /*
3384 * If no parent, move charges to root cgroup. 3393 * If no parent, move charges to root cgroup.
3385 */ 3394 */
3386 if (!parent) 3395 if (!parent)
3387 parent = root_mem_cgroup; 3396 parent = root_mem_cgroup;
3388 3397
3389 if (nr_pages > 1) { 3398 if (nr_pages > 1) {
3390 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3399 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3391 flags = compound_lock_irqsave(page); 3400 flags = compound_lock_irqsave(page);
3392 } 3401 }
3393 3402
3394 ret = mem_cgroup_move_account(page, nr_pages, 3403 ret = mem_cgroup_move_account(page, nr_pages,
3395 pc, child, parent); 3404 pc, child, parent);
3396 if (!ret) { 3405 if (!ret) {
3406 if (!mem_cgroup_is_root(parent))
3407 css_get_many(&parent->css, nr_pages);
3397 /* Take charge off the local counters */ 3408 /* Take charge off the local counters */
3398 page_counter_cancel(&child->memory, nr_pages); 3409 page_counter_cancel(&child->memory, nr_pages);
3399 if (do_swap_account) 3410 if (do_swap_account)
3400 page_counter_cancel(&child->memsw, nr_pages); 3411 page_counter_cancel(&child->memsw, nr_pages);
3412 css_put_many(&child->css, nr_pages);
3401 } 3413 }
3402 3414
3403 if (nr_pages > 1) 3415 if (nr_pages > 1)
3404 compound_unlock_irqrestore(page, flags); 3416 compound_unlock_irqrestore(page, flags);
3405 putback_lru_page(page); 3417 putback_lru_page(page);
3406 put: 3418 put:
3407 put_page(page); 3419 put_page(page);
3408 out: 3420 out:
3409 return ret; 3421 return ret;
3410 } 3422 }
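The added hunk above is this patch's change to the reparenting path: when nr_pages charges move to the parent, the parent gains one css reference per page and the child drops one per page, making the "each charged page pins its css" rule explicit. The toy refcount below, using C11 atomics, shows only that bookkeeping; the get_many()/put_many() names mirror css_get_many()/css_put_many() but this is not the percpu-refcount machinery the kernel actually uses.

#include <stdatomic.h>
#include <stdio.h>

struct css {
	const char *name;
	atomic_long refcnt;
};

static void css_get_many(struct css *css, unsigned long n)
{
	atomic_fetch_add(&css->refcnt, (long)n);
}

static void css_put_many(struct css *css, unsigned long n)
{
	if (atomic_fetch_sub(&css->refcnt, (long)n) == (long)n)
		printf("%s: last reference dropped, group can be freed\n", css->name);
}

int main(void)
{
	struct css parent = { "parent", 1 };   /* base reference held by the hierarchy */
	struct css child  = { "child",  1 };
	unsigned long nr_pages = 512;          /* e.g. one 2MB THP worth of 4K pages */

	/* the child pins itself once per charged page */
	css_get_many(&child, nr_pages);

	/* reparent: parent takes one ref per page, child drops one per page */
	css_get_many(&parent, nr_pages);
	css_put_many(&child, nr_pages);

	printf("parent refs=%ld, child refs=%ld\n",
	       atomic_load(&parent.refcnt), atomic_load(&child.refcnt));
	return 0;
}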
3411 3423
3412 #ifdef CONFIG_MEMCG_SWAP 3424 #ifdef CONFIG_MEMCG_SWAP
3413 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3425 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3414 bool charge) 3426 bool charge)
3415 { 3427 {
3416 int val = (charge) ? 1 : -1; 3428 int val = (charge) ? 1 : -1;
3417 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3429 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
3418 } 3430 }
3419 3431
3420 /** 3432 /**
3421 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3433 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3422 * @entry: swap entry to be moved 3434 * @entry: swap entry to be moved
3423 * @from: mem_cgroup which the entry is moved from 3435 * @from: mem_cgroup which the entry is moved from
3424 * @to: mem_cgroup which the entry is moved to 3436 * @to: mem_cgroup which the entry is moved to
3425 * 3437 *
3426 * It succeeds only when the swap_cgroup's record for this entry is the same 3438 * It succeeds only when the swap_cgroup's record for this entry is the same
3427 * as the mem_cgroup's id of @from. 3439 * as the mem_cgroup's id of @from.
3428 * 3440 *
3429 * Returns 0 on success, -EINVAL on failure. 3441 * Returns 0 on success, -EINVAL on failure.
3430 * 3442 *
3431 * The caller must have charged to @to, IOW, called page_counter_charge() about 3443 * The caller must have charged to @to, IOW, called page_counter_charge() about
3432 * both res and memsw, and called css_get(). 3444 * both res and memsw, and called css_get().
3433 */ 3445 */
3434 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3446 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3435 struct mem_cgroup *from, struct mem_cgroup *to) 3447 struct mem_cgroup *from, struct mem_cgroup *to)
3436 { 3448 {
3437 unsigned short old_id, new_id; 3449 unsigned short old_id, new_id;
3438 3450
3439 old_id = mem_cgroup_id(from); 3451 old_id = mem_cgroup_id(from);
3440 new_id = mem_cgroup_id(to); 3452 new_id = mem_cgroup_id(to);
3441 3453
3442 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3454 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3443 mem_cgroup_swap_statistics(from, false); 3455 mem_cgroup_swap_statistics(from, false);
3444 mem_cgroup_swap_statistics(to, true); 3456 mem_cgroup_swap_statistics(to, true);
3445 /* 3457 /*
3446 * This function is only called from task migration context now. 3458 * This function is only called from task migration context now.
3447 * It postpones page_counter and refcount handling till the end 3459 * It postpones page_counter and refcount handling till the end
3448 * of task migration(mem_cgroup_clear_mc()) for performance 3460 * of task migration(mem_cgroup_clear_mc()) for performance
3449 * improvement. But we cannot postpone css_get(to) because if 3461 * improvement. But we cannot postpone css_get(to) because if
3450 * the process that has been moved to @to does swap-in, the 3462 * the process that has been moved to @to does swap-in, the
3451 * refcount of @to might be decreased to 0. 3463 * refcount of @to might be decreased to 0.
3452 * 3464 *
3453 * We are in attach() phase, so the cgroup is guaranteed to be 3465 * We are in attach() phase, so the cgroup is guaranteed to be
3454 * alive, so we can just call css_get(). 3466 * alive, so we can just call css_get().
3455 */ 3467 */
3456 css_get(&to->css); 3468 css_get(&to->css);
3457 return 0; 3469 return 0;
3458 } 3470 }
3459 return -EINVAL; 3471 return -EINVAL;
3460 } 3472 }
3461 #else 3473 #else
3462 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3474 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3463 struct mem_cgroup *from, struct mem_cgroup *to) 3475 struct mem_cgroup *from, struct mem_cgroup *to)
3464 { 3476 {
3465 return -EINVAL; 3477 return -EINVAL;
3466 } 3478 }
3467 #endif 3479 #endif
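mem_cgroup_move_swap_account() hinges on a compare-and-exchange of the swap_cgroup record: the owner id is rewritten only if it still matches @from, which is what makes the move safe against a concurrent uncharge. A stand-alone sketch of that check-and-swap idiom follows; the record array, slot index, and ids are invented for illustration and do not reflect how the kernel packs swap_cgroup records.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* One ownership record per "swap slot". */
static _Atomic unsigned short swap_record[16];

/* Reassign slot 'idx' from old_id to new_id only if old_id still owns it. */
static bool move_swap_record(unsigned idx, unsigned short old_id,
			     unsigned short new_id)
{
	unsigned short expected = old_id;

	return atomic_compare_exchange_strong(&swap_record[idx], &expected, new_id);
}

int main(void)
{
	atomic_store(&swap_record[3], 7);            /* slot 3 charged to group id 7 */

	printf("move 7->9: %s\n", move_swap_record(3, 7, 9) ? "ok" : "raced");
	printf("move 7->9 again: %s\n", move_swap_record(3, 7, 9) ? "ok" : "raced");
	printf("slot 3 now owned by id %u\n", (unsigned)atomic_load(&swap_record[3]));
	return 0;
}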
3468 3480
3469 #ifdef CONFIG_DEBUG_VM 3481 #ifdef CONFIG_DEBUG_VM
3470 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3482 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3471 { 3483 {
3472 struct page_cgroup *pc; 3484 struct page_cgroup *pc;
3473 3485
3474 pc = lookup_page_cgroup(page); 3486 pc = lookup_page_cgroup(page);
3475 /* 3487 /*
3476 * Can be NULL while feeding pages into the page allocator for 3488 * Can be NULL while feeding pages into the page allocator for
3477 * the first time, i.e. during boot or memory hotplug; 3489 * the first time, i.e. during boot or memory hotplug;
3478 * or when mem_cgroup_disabled(). 3490 * or when mem_cgroup_disabled().
3479 */ 3491 */
3480 if (likely(pc) && PageCgroupUsed(pc)) 3492 if (likely(pc) && PageCgroupUsed(pc))
3481 return pc; 3493 return pc;
3482 return NULL; 3494 return NULL;
3483 } 3495 }
3484 3496
3485 bool mem_cgroup_bad_page_check(struct page *page) 3497 bool mem_cgroup_bad_page_check(struct page *page)
3486 { 3498 {
3487 if (mem_cgroup_disabled()) 3499 if (mem_cgroup_disabled())
3488 return false; 3500 return false;
3489 3501
3490 return lookup_page_cgroup_used(page) != NULL; 3502 return lookup_page_cgroup_used(page) != NULL;
3491 } 3503 }
3492 3504
3493 void mem_cgroup_print_bad_page(struct page *page) 3505 void mem_cgroup_print_bad_page(struct page *page)
3494 { 3506 {
3495 struct page_cgroup *pc; 3507 struct page_cgroup *pc;
3496 3508
3497 pc = lookup_page_cgroup_used(page); 3509 pc = lookup_page_cgroup_used(page);
3498 if (pc) { 3510 if (pc) {
3499 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3511 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3500 pc, pc->flags, pc->mem_cgroup); 3512 pc, pc->flags, pc->mem_cgroup);
3501 } 3513 }
3502 } 3514 }
3503 #endif 3515 #endif
3504 3516
3505 static DEFINE_MUTEX(memcg_limit_mutex); 3517 static DEFINE_MUTEX(memcg_limit_mutex);
3506 3518
3507 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3519 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3508 unsigned long limit) 3520 unsigned long limit)
3509 { 3521 {
3510 unsigned long curusage; 3522 unsigned long curusage;
3511 unsigned long oldusage; 3523 unsigned long oldusage;
3512 bool enlarge = false; 3524 bool enlarge = false;
3513 int retry_count; 3525 int retry_count;
3514 int ret; 3526 int ret;
3515 3527
3516 /* 3528 /*
3517 * For keeping hierarchical_reclaim simple, how long we should retry 3529 * For keeping hierarchical_reclaim simple, how long we should retry
3518 * depends on the callers. We set our retry-count to be a function 3530 * depends on the callers. We set our retry-count to be a function
3519 * of the # of children which we should visit in this loop. 3531 * of the # of children which we should visit in this loop.
3520 */ 3532 */
3521 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3533 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3522 mem_cgroup_count_children(memcg); 3534 mem_cgroup_count_children(memcg);
3523 3535
3524 oldusage = page_counter_read(&memcg->memory); 3536 oldusage = page_counter_read(&memcg->memory);
3525 3537
3526 do { 3538 do {
3527 if (signal_pending(current)) { 3539 if (signal_pending(current)) {
3528 ret = -EINTR; 3540 ret = -EINTR;
3529 break; 3541 break;
3530 } 3542 }
3531 3543
3532 mutex_lock(&memcg_limit_mutex); 3544 mutex_lock(&memcg_limit_mutex);
3533 if (limit > memcg->memsw.limit) { 3545 if (limit > memcg->memsw.limit) {
3534 mutex_unlock(&memcg_limit_mutex); 3546 mutex_unlock(&memcg_limit_mutex);
3535 ret = -EINVAL; 3547 ret = -EINVAL;
3536 break; 3548 break;
3537 } 3549 }
3538 if (limit > memcg->memory.limit) 3550 if (limit > memcg->memory.limit)
3539 enlarge = true; 3551 enlarge = true;
3540 ret = page_counter_limit(&memcg->memory, limit); 3552 ret = page_counter_limit(&memcg->memory, limit);
3541 mutex_unlock(&memcg_limit_mutex); 3553 mutex_unlock(&memcg_limit_mutex);
3542 3554
3543 if (!ret) 3555 if (!ret)
3544 break; 3556 break;
3545 3557
3546 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3558 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3547 3559
3548 curusage = page_counter_read(&memcg->memory); 3560 curusage = page_counter_read(&memcg->memory);
3549 /* Usage is reduced ? */ 3561 /* Usage is reduced ? */
3550 if (curusage >= oldusage) 3562 if (curusage >= oldusage)
3551 retry_count--; 3563 retry_count--;
3552 else 3564 else
3553 oldusage = curusage; 3565 oldusage = curusage;
3554 } while (retry_count); 3566 } while (retry_count);
3555 3567
3556 if (!ret && enlarge) 3568 if (!ret && enlarge)
3557 memcg_oom_recover(memcg); 3569 memcg_oom_recover(memcg);
3558 3570
3559 return ret; 3571 return ret;
3560 } 3572 }
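Both resize paths follow the same shape: try to lower the counter's limit, and if that fails because usage is above the new limit, reclaim and retry a bounded number of times, only burning a retry when reclaim made no forward progress. A userspace sketch of that control flow is below; the counter struct and the reclaim stub are assumptions for the example, not the kernel's page_counter.

#include <errno.h>
#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long limit;
};

/* Pretend reclaim: frees up to 8 units per call, possibly nothing. */
static unsigned long reclaim_some(struct counter *c)
{
	unsigned long freed = c->usage > 8 ? 8 : c->usage;

	c->usage -= freed;
	return freed;
}

static int set_limit(struct counter *c, unsigned long new_limit)
{
	if (c->usage > new_limit)
		return -EBUSY;      /* can't shrink below current usage */
	c->limit = new_limit;
	return 0;
}

static int resize_limit(struct counter *c, unsigned long new_limit, int retries)
{
	unsigned long oldusage = c->usage;

	while (retries) {
		if (!set_limit(c, new_limit))
			return 0;

		reclaim_some(c);
		/* only burn a retry if usage did not shrink */
		if (c->usage >= oldusage)
			retries--;
		else
			oldusage = c->usage;
	}
	return -EBUSY;
}

int main(void)
{
	struct counter c = { .usage = 40, .limit = 100 };

	printf("resize to 32: %d (usage=%lu, limit=%lu)\n",
	       resize_limit(&c, 32, 5), c.usage, c.limit);
	return 0;
}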
3561 3573
3562 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3574 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3563 unsigned long limit) 3575 unsigned long limit)
3564 { 3576 {
3565 unsigned long curusage; 3577 unsigned long curusage;
3566 unsigned long oldusage; 3578 unsigned long oldusage;
3567 bool enlarge = false; 3579 bool enlarge = false;
3568 int retry_count; 3580 int retry_count;
3569 int ret; 3581 int ret;
3570 3582
3571 /* see mem_cgroup_resize_limit */ 3583 /* see mem_cgroup_resize_limit */
3572 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3584 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3573 mem_cgroup_count_children(memcg); 3585 mem_cgroup_count_children(memcg);
3574 3586
3575 oldusage = page_counter_read(&memcg->memsw); 3587 oldusage = page_counter_read(&memcg->memsw);
3576 3588
3577 do { 3589 do {
3578 if (signal_pending(current)) { 3590 if (signal_pending(current)) {
3579 ret = -EINTR; 3591 ret = -EINTR;
3580 break; 3592 break;
3581 } 3593 }
3582 3594
3583 mutex_lock(&memcg_limit_mutex); 3595 mutex_lock(&memcg_limit_mutex);
3584 if (limit < memcg->memory.limit) { 3596 if (limit < memcg->memory.limit) {
3585 mutex_unlock(&memcg_limit_mutex); 3597 mutex_unlock(&memcg_limit_mutex);
3586 ret = -EINVAL; 3598 ret = -EINVAL;
3587 break; 3599 break;
3588 } 3600 }
3589 if (limit > memcg->memsw.limit) 3601 if (limit > memcg->memsw.limit)
3590 enlarge = true; 3602 enlarge = true;
3591 ret = page_counter_limit(&memcg->memsw, limit); 3603 ret = page_counter_limit(&memcg->memsw, limit);
3592 mutex_unlock(&memcg_limit_mutex); 3604 mutex_unlock(&memcg_limit_mutex);
3593 3605
3594 if (!ret) 3606 if (!ret)
3595 break; 3607 break;
3596 3608
3597 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3609 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3598 3610
3599 curusage = page_counter_read(&memcg->memsw); 3611 curusage = page_counter_read(&memcg->memsw);
3600 /* Usage is reduced ? */ 3612 /* Usage is reduced ? */
3601 if (curusage >= oldusage) 3613 if (curusage >= oldusage)
3602 retry_count--; 3614 retry_count--;
3603 else 3615 else
3604 oldusage = curusage; 3616 oldusage = curusage;
3605 } while (retry_count); 3617 } while (retry_count);
3606 3618
3607 if (!ret && enlarge) 3619 if (!ret && enlarge)
3608 memcg_oom_recover(memcg); 3620 memcg_oom_recover(memcg);
3609 3621
3610 return ret; 3622 return ret;
3611 } 3623 }
3612 3624
3613 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3625 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3614 gfp_t gfp_mask, 3626 gfp_t gfp_mask,
3615 unsigned long *total_scanned) 3627 unsigned long *total_scanned)
3616 { 3628 {
3617 unsigned long nr_reclaimed = 0; 3629 unsigned long nr_reclaimed = 0;
3618 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3630 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3619 unsigned long reclaimed; 3631 unsigned long reclaimed;
3620 int loop = 0; 3632 int loop = 0;
3621 struct mem_cgroup_tree_per_zone *mctz; 3633 struct mem_cgroup_tree_per_zone *mctz;
3622 unsigned long excess; 3634 unsigned long excess;
3623 unsigned long nr_scanned; 3635 unsigned long nr_scanned;
3624 3636
3625 if (order > 0) 3637 if (order > 0)
3626 return 0; 3638 return 0;
3627 3639
3628 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3640 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3629 /* 3641 /*
3630 * This loop can run for a while, especially if mem_cgroups continuously 3642 * This loop can run for a while, especially if mem_cgroups continuously
3631 * keep exceeding their soft limit and putting the system under 3643 * keep exceeding their soft limit and putting the system under
3632 * pressure 3644 * pressure
3633 */ 3645 */
3634 do { 3646 do {
3635 if (next_mz) 3647 if (next_mz)
3636 mz = next_mz; 3648 mz = next_mz;
3637 else 3649 else
3638 mz = mem_cgroup_largest_soft_limit_node(mctz); 3650 mz = mem_cgroup_largest_soft_limit_node(mctz);
3639 if (!mz) 3651 if (!mz)
3640 break; 3652 break;
3641 3653
3642 nr_scanned = 0; 3654 nr_scanned = 0;
3643 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3655 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3644 gfp_mask, &nr_scanned); 3656 gfp_mask, &nr_scanned);
3645 nr_reclaimed += reclaimed; 3657 nr_reclaimed += reclaimed;
3646 *total_scanned += nr_scanned; 3658 *total_scanned += nr_scanned;
3647 spin_lock_irq(&mctz->lock); 3659 spin_lock_irq(&mctz->lock);
3648 3660
3649 /* 3661 /*
3650 * If we failed to reclaim anything from this memory cgroup 3662 * If we failed to reclaim anything from this memory cgroup
3651 * it is time to move on to the next cgroup 3663 * it is time to move on to the next cgroup
3652 */ 3664 */
3653 next_mz = NULL; 3665 next_mz = NULL;
3654 if (!reclaimed) { 3666 if (!reclaimed) {
3655 do { 3667 do {
3656 /* 3668 /*
3657 * Loop until we find yet another one. 3669 * Loop until we find yet another one.
3658 * 3670 *
3659 * By the time we get the soft_limit lock 3671 * By the time we get the soft_limit lock
3660 * again, someone might have added the 3672 * again, someone might have added the
3661 * group back on the RB tree. Iterate to 3673 * group back on the RB tree. Iterate to
3662 * make sure we get a different mem. 3674 * make sure we get a different mem.
3663 * mem_cgroup_largest_soft_limit_node returns 3675 * mem_cgroup_largest_soft_limit_node returns
3664 * NULL if no other cgroup is present on 3676 * NULL if no other cgroup is present on
3665 * the tree 3677 * the tree
3666 */ 3678 */
3667 next_mz = 3679 next_mz =
3668 __mem_cgroup_largest_soft_limit_node(mctz); 3680 __mem_cgroup_largest_soft_limit_node(mctz);
3669 if (next_mz == mz) 3681 if (next_mz == mz)
3670 css_put(&next_mz->memcg->css); 3682 css_put(&next_mz->memcg->css);
3671 else /* next_mz == NULL or other memcg */ 3683 else /* next_mz == NULL or other memcg */
3672 break; 3684 break;
3673 } while (1); 3685 } while (1);
3674 } 3686 }
3675 __mem_cgroup_remove_exceeded(mz, mctz); 3687 __mem_cgroup_remove_exceeded(mz, mctz);
3676 excess = soft_limit_excess(mz->memcg); 3688 excess = soft_limit_excess(mz->memcg);
3677 /* 3689 /*
3678 * One school of thought says that we should not add 3690 * One school of thought says that we should not add
3679 * back the node to the tree if reclaim returns 0. 3691 * back the node to the tree if reclaim returns 0.
3680 * But our reclaim could return 0, simply because due 3692 * But our reclaim could return 0, simply because due
3681 * to priority we are exposing a smaller subset of 3693 * to priority we are exposing a smaller subset of
3682 * memory to reclaim from. Consider this as a longer 3694 * memory to reclaim from. Consider this as a longer
3683 * term TODO. 3695 * term TODO.
3684 */ 3696 */
3685 /* If excess == 0, no tree ops */ 3697 /* If excess == 0, no tree ops */
3686 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3698 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3687 spin_unlock_irq(&mctz->lock); 3699 spin_unlock_irq(&mctz->lock);
3688 css_put(&mz->memcg->css); 3700 css_put(&mz->memcg->css);
3689 loop++; 3701 loop++;
3690 /* 3702 /*
3691 * Could not reclaim anything and there are no more 3703 * Could not reclaim anything and there are no more
3692 * mem cgroups to try or we seem to be looping without 3704 * mem cgroups to try or we seem to be looping without
3693 * reclaiming anything. 3705 * reclaiming anything.
3694 */ 3706 */
3695 if (!nr_reclaimed && 3707 if (!nr_reclaimed &&
3696 (next_mz == NULL || 3708 (next_mz == NULL ||
3697 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3709 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3698 break; 3710 break;
3699 } while (!nr_reclaimed); 3711 } while (!nr_reclaimed);
3700 if (next_mz) 3712 if (next_mz)
3701 css_put(&next_mz->memcg->css); 3713 css_put(&next_mz->memcg->css);
3702 return nr_reclaimed; 3714 return nr_reclaimed;
3703 } 3715 }
3704 3716
3705 /** 3717 /**
3706 * mem_cgroup_force_empty_list - clears LRU of a group 3718 * mem_cgroup_force_empty_list - clears LRU of a group
3707 * @memcg: group to clear 3719 * @memcg: group to clear
3708 * @node: NUMA node 3720 * @node: NUMA node
3709 * @zid: zone id 3721 * @zid: zone id
3710 * @lru: lru to clear 3722 * @lru: lru to clear
3711 * 3723 *
3712 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 3724 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3713 * reclaim the pages themselves - pages are moved to the parent (or root) 3725 * reclaim the pages themselves - pages are moved to the parent (or root)
3714 * group. 3726 * group.
3715 */ 3727 */
3716 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3728 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3717 int node, int zid, enum lru_list lru) 3729 int node, int zid, enum lru_list lru)
3718 { 3730 {
3719 struct lruvec *lruvec; 3731 struct lruvec *lruvec;
3720 unsigned long flags; 3732 unsigned long flags;
3721 struct list_head *list; 3733 struct list_head *list;
3722 struct page *busy; 3734 struct page *busy;
3723 struct zone *zone; 3735 struct zone *zone;
3724 3736
3725 zone = &NODE_DATA(node)->node_zones[zid]; 3737 zone = &NODE_DATA(node)->node_zones[zid];
3726 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3738 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3727 list = &lruvec->lists[lru]; 3739 list = &lruvec->lists[lru];
3728 3740
3729 busy = NULL; 3741 busy = NULL;
3730 do { 3742 do {
3731 struct page_cgroup *pc; 3743 struct page_cgroup *pc;
3732 struct page *page; 3744 struct page *page;
3733 3745
3734 spin_lock_irqsave(&zone->lru_lock, flags); 3746 spin_lock_irqsave(&zone->lru_lock, flags);
3735 if (list_empty(list)) { 3747 if (list_empty(list)) {
3736 spin_unlock_irqrestore(&zone->lru_lock, flags); 3748 spin_unlock_irqrestore(&zone->lru_lock, flags);
3737 break; 3749 break;
3738 } 3750 }
3739 page = list_entry(list->prev, struct page, lru); 3751 page = list_entry(list->prev, struct page, lru);
3740 if (busy == page) { 3752 if (busy == page) {
3741 list_move(&page->lru, list); 3753 list_move(&page->lru, list);
3742 busy = NULL; 3754 busy = NULL;
3743 spin_unlock_irqrestore(&zone->lru_lock, flags); 3755 spin_unlock_irqrestore(&zone->lru_lock, flags);
3744 continue; 3756 continue;
3745 } 3757 }
3746 spin_unlock_irqrestore(&zone->lru_lock, flags); 3758 spin_unlock_irqrestore(&zone->lru_lock, flags);
3747 3759
3748 pc = lookup_page_cgroup(page); 3760 pc = lookup_page_cgroup(page);
3749 3761
3750 if (mem_cgroup_move_parent(page, pc, memcg)) { 3762 if (mem_cgroup_move_parent(page, pc, memcg)) {
3751 /* found lock contention or "pc" is obsolete. */ 3763 /* found lock contention or "pc" is obsolete. */
3752 busy = page; 3764 busy = page;
3753 } else 3765 } else
3754 busy = NULL; 3766 busy = NULL;
3755 cond_resched(); 3767 cond_resched();
3756 } while (!list_empty(list)); 3768 } while (!list_empty(list));
3757 } 3769 }
3758 3770
3759 /* 3771 /*
3760 * make the mem_cgroup's charge 0 if there is no task, by moving 3772 * make the mem_cgroup's charge 0 if there is no task, by moving
3761 * all the charges and pages to the parent. 3773 * all the charges and pages to the parent.
3762 * This enables deleting this mem_cgroup. 3774 * This enables deleting this mem_cgroup.
3763 * 3775 *
3764 * Caller is responsible for holding css reference on the memcg. 3776 * Caller is responsible for holding css reference on the memcg.
3765 */ 3777 */
3766 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3778 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3767 { 3779 {
3768 int node, zid; 3780 int node, zid;
3769 3781
3770 do { 3782 do {
3771 /* This is for making all *used* pages to be on LRU. */ 3783 /* This is for making all *used* pages to be on LRU. */
3772 lru_add_drain_all(); 3784 lru_add_drain_all();
3773 drain_all_stock_sync(memcg); 3785 drain_all_stock_sync(memcg);
3774 mem_cgroup_start_move(memcg); 3786 mem_cgroup_start_move(memcg);
3775 for_each_node_state(node, N_MEMORY) { 3787 for_each_node_state(node, N_MEMORY) {
3776 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3788 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3777 enum lru_list lru; 3789 enum lru_list lru;
3778 for_each_lru(lru) { 3790 for_each_lru(lru) {
3779 mem_cgroup_force_empty_list(memcg, 3791 mem_cgroup_force_empty_list(memcg,
3780 node, zid, lru); 3792 node, zid, lru);
3781 } 3793 }
3782 } 3794 }
3783 } 3795 }
3784 mem_cgroup_end_move(memcg); 3796 mem_cgroup_end_move(memcg);
3785 memcg_oom_recover(memcg); 3797 memcg_oom_recover(memcg);
3786 cond_resched(); 3798 cond_resched();
3787 3799
3788 /* 3800 /*
3789 * Kernel memory may not necessarily be trackable to a specific 3801 * Kernel memory may not necessarily be trackable to a specific
3790 * process, so such pages are not migrated, and therefore we can't 3802 * process, so such pages are not migrated, and therefore we can't
3791 * expect their value to drop to 0 here. 3803 * expect their value to drop to 0 here.
3792 * Having res filled up with kmem only is enough. 3804 * Having res filled up with kmem only is enough.
3793 * 3805 *
3794 * This is a safety check because mem_cgroup_force_empty_list 3806 * This is a safety check because mem_cgroup_force_empty_list
3795 * could have raced with mem_cgroup_replace_page_cache callers 3807 * could have raced with mem_cgroup_replace_page_cache callers
3796 * so the lru seemed empty but the page could have been added 3808 * so the lru seemed empty but the page could have been added
3797 * right after the check. RES_USAGE should be safe as we always 3809 * right after the check. RES_USAGE should be safe as we always
3798 * charge before adding to the LRU. 3810 * charge before adding to the LRU.
3799 */ 3811 */
3800 } while (page_counter_read(&memcg->memory) - 3812 } while (page_counter_read(&memcg->memory) -
3801 page_counter_read(&memcg->kmem) > 0); 3813 page_counter_read(&memcg->kmem) > 0);
3802 } 3814 }
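mem_cgroup_reparent_charges() sweeps every node/zone/LRU list and then re-checks the residual charge, repeating the whole sweep because pages can be charged (or sit on a pagevec) while the sweep runs; only kernel-memory charges are allowed to remain. The loop skeleton might look like the sketch below, with the memcg machinery replaced by an invented per-list page count.

#include <stdio.h>

#define NR_NODES 2
#define NR_LRUS  4

static int lru_pages[NR_NODES][NR_LRUS] = {
	{ 3, 0, 5, 1 },
	{ 2, 2, 0, 0 },
};
static int user_charge = 13;   /* total pages above, i.e. memory minus kmem */

/* Move every page currently on one list to the parent group. */
static void force_empty_list(int node, int lru)
{
	user_charge -= lru_pages[node][lru];
	lru_pages[node][lru] = 0;
}

int main(void)
{
	int passes = 0;

	do {
		/* lru_add_drain_all() / drain_all_stock_sync() would go here */
		for (int node = 0; node < NR_NODES; node++)
			for (int lru = 0; lru < NR_LRUS; lru++)
				force_empty_list(node, lru);
		passes++;
		/* re-check: pages charged during the sweep need another pass */
	} while (user_charge > 0);

	printf("emptied after %d pass(es), residual user charge %d\n",
	       passes, user_charge);
	return 0;
}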
3803 3815
3804 /* 3816 /*
3805 * Test whether @memcg has children, dead or alive. Note that this 3817 * Test whether @memcg has children, dead or alive. Note that this
3806 * function doesn't care whether @memcg has use_hierarchy enabled and 3818 * function doesn't care whether @memcg has use_hierarchy enabled and
3807 * returns %true if there are child csses according to the cgroup 3819 * returns %true if there are child csses according to the cgroup
3808 * hierarchy. Testing use_hierarchy is the caller's responsibility. 3820 * hierarchy. Testing use_hierarchy is the caller's responsibility.
3809 */ 3821 */
3810 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3822 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3811 { 3823 {
3812 bool ret; 3824 bool ret;
3813 3825
3814 /* 3826 /*
3815 * The lock does not prevent addition or deletion of children, but 3827 * The lock does not prevent addition or deletion of children, but
3816 * it prevents a new child from being initialized based on this 3828 * it prevents a new child from being initialized based on this
3817 * parent in css_online(), so it's enough to decide whether 3829 * parent in css_online(), so it's enough to decide whether
3818 * hierarchically inherited attributes can still be changed or not. 3830 * hierarchically inherited attributes can still be changed or not.
3819 */ 3831 */
3820 lockdep_assert_held(&memcg_create_mutex); 3832 lockdep_assert_held(&memcg_create_mutex);
3821 3833
3822 rcu_read_lock(); 3834 rcu_read_lock();
3823 ret = css_next_child(NULL, &memcg->css); 3835 ret = css_next_child(NULL, &memcg->css);
3824 rcu_read_unlock(); 3836 rcu_read_unlock();
3825 return ret; 3837 return ret;
3826 } 3838 }
3827 3839
3828 /* 3840 /*
3829 * Reclaims as many pages from the given memcg as possible and moves 3841 * Reclaims as many pages from the given memcg as possible and moves
3830 * the rest to the parent. 3842 * the rest to the parent.
3831 * 3843 *
3832 * Caller is responsible for holding css reference for memcg. 3844 * Caller is responsible for holding css reference for memcg.
3833 */ 3845 */
3834 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3846 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3835 { 3847 {
3836 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3848 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3837 3849
3838 /* we call try-to-free pages to make this cgroup empty */ 3850 /* we call try-to-free pages to make this cgroup empty */
3839 lru_add_drain_all(); 3851 lru_add_drain_all();
3840 /* try to free all pages in this cgroup */ 3852 /* try to free all pages in this cgroup */
3841 while (nr_retries && page_counter_read(&memcg->memory)) { 3853 while (nr_retries && page_counter_read(&memcg->memory)) {
3842 int progress; 3854 int progress;
3843 3855
3844 if (signal_pending(current)) 3856 if (signal_pending(current))
3845 return -EINTR; 3857 return -EINTR;
3846 3858
3847 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3859 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3848 GFP_KERNEL, true); 3860 GFP_KERNEL, true);
3849 if (!progress) { 3861 if (!progress) {
3850 nr_retries--; 3862 nr_retries--;
3851 /* maybe some writeback is necessary */ 3863 /* maybe some writeback is necessary */
3852 congestion_wait(BLK_RW_ASYNC, HZ/10); 3864 congestion_wait(BLK_RW_ASYNC, HZ/10);
3853 } 3865 }
3854 3866
3855 } 3867 }
3856 3868
3857 return 0; 3869 return 0;
3858 } 3870 }
3859 3871
3860 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3872 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3861 char *buf, size_t nbytes, 3873 char *buf, size_t nbytes,
3862 loff_t off) 3874 loff_t off)
3863 { 3875 {
3864 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3876 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3865 3877
3866 if (mem_cgroup_is_root(memcg)) 3878 if (mem_cgroup_is_root(memcg))
3867 return -EINVAL; 3879 return -EINVAL;
3868 return mem_cgroup_force_empty(memcg) ?: nbytes; 3880 return mem_cgroup_force_empty(memcg) ?: nbytes;
3869 } 3881 }
3870 3882
3871 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3883 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3872 struct cftype *cft) 3884 struct cftype *cft)
3873 { 3885 {
3874 return mem_cgroup_from_css(css)->use_hierarchy; 3886 return mem_cgroup_from_css(css)->use_hierarchy;
3875 } 3887 }
3876 3888
3877 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3889 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3878 struct cftype *cft, u64 val) 3890 struct cftype *cft, u64 val)
3879 { 3891 {
3880 int retval = 0; 3892 int retval = 0;
3881 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3893 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3882 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 3894 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3883 3895
3884 mutex_lock(&memcg_create_mutex); 3896 mutex_lock(&memcg_create_mutex);
3885 3897
3886 if (memcg->use_hierarchy == val) 3898 if (memcg->use_hierarchy == val)
3887 goto out; 3899 goto out;
3888 3900
3889 /* 3901 /*
3890 * If parent's use_hierarchy is set, we can't make any modifications 3902 * If parent's use_hierarchy is set, we can't make any modifications
3891 * in the child subtrees. If it is unset, then the change can 3903 * in the child subtrees. If it is unset, then the change can
3892 * occur, provided the current cgroup has no children. 3904 * occur, provided the current cgroup has no children.
3893 * 3905 *
3894 * For the root cgroup, parent_mem is NULL, we allow value to be 3906 * For the root cgroup, parent_mem is NULL, we allow value to be
3895 * set if there are no children. 3907 * set if there are no children.
3896 */ 3908 */
3897 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3909 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3898 (val == 1 || val == 0)) { 3910 (val == 1 || val == 0)) {
3899 if (!memcg_has_children(memcg)) 3911 if (!memcg_has_children(memcg))
3900 memcg->use_hierarchy = val; 3912 memcg->use_hierarchy = val;
3901 else 3913 else
3902 retval = -EBUSY; 3914 retval = -EBUSY;
3903 } else 3915 } else
3904 retval = -EINVAL; 3916 retval = -EINVAL;
3905 3917
3906 out: 3918 out:
3907 mutex_unlock(&memcg_create_mutex); 3919 mutex_unlock(&memcg_create_mutex);
3908 3920
3909 return retval; 3921 return retval;
3910 } 3922 }
3911 3923
3912 static unsigned long tree_stat(struct mem_cgroup *memcg, 3924 static unsigned long tree_stat(struct mem_cgroup *memcg,
3913 enum mem_cgroup_stat_index idx) 3925 enum mem_cgroup_stat_index idx)
3914 { 3926 {
3915 struct mem_cgroup *iter; 3927 struct mem_cgroup *iter;
3916 long val = 0; 3928 long val = 0;
3917 3929
3918 /* Per-cpu values can be negative, use a signed accumulator */ 3930 /* Per-cpu values can be negative, use a signed accumulator */
3919 for_each_mem_cgroup_tree(iter, memcg) 3931 for_each_mem_cgroup_tree(iter, memcg)
3920 val += mem_cgroup_read_stat(iter, idx); 3932 val += mem_cgroup_read_stat(iter, idx);
3921 3933
3922 if (val < 0) /* race ? */ 3934 if (val < 0) /* race ? */
3923 val = 0; 3935 val = 0;
3924 return val; 3936 return val;
3925 } 3937 }
3926 3938
3927 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3939 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3928 { 3940 {
3929 u64 val; 3941 u64 val;
3930 3942
3931 if (mem_cgroup_is_root(memcg)) { 3943 if (mem_cgroup_is_root(memcg)) {
3932 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 3944 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3933 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 3945 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3934 if (swap) 3946 if (swap)
3935 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 3947 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3936 } else { 3948 } else {
3937 if (!swap) 3949 if (!swap)
3938 val = page_counter_read(&memcg->memory); 3950 val = page_counter_read(&memcg->memory);
3939 else 3951 else
3940 val = page_counter_read(&memcg->memsw); 3952 val = page_counter_read(&memcg->memsw);
3941 } 3953 }
3942 return val << PAGE_SHIFT; 3954 return val << PAGE_SHIFT;
3943 } 3955 }
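For the root group, usage is derived by summing the cache/rss (and optionally swap) statistics over the whole subtree, while any other group simply reads its page counter; the page count is then shifted into bytes. A small stand-alone sketch of that split follows; the two-level tree, the stat fields, and the subtree walk are made up for illustration and do not mirror for_each_mem_cgroup_tree().

#include <stdio.h>

#define PAGE_SHIFT 12

struct group {
	unsigned long cache, rss;         /* per-group statistics, in pages */
	unsigned long counter;            /* charged pages (page_counter analogue) */
	struct group *parent;
};

/* Sum one statistic over 'root' and all of its descendants. */
static unsigned long tree_stat(struct group *groups, int n, struct group *root,
			       unsigned long (*pick)(struct group *))
{
	unsigned long val = 0;

	for (int i = 0; i < n; i++)
		for (struct group *p = &groups[i]; p; p = p->parent)
			if (p == root) {          /* groups[i] is inside root's subtree */
				val += pick(&groups[i]);
				break;
			}
	return val;
}

static unsigned long pick_cache(struct group *g) { return g->cache; }
static unsigned long pick_rss(struct group *g)   { return g->rss; }

static unsigned long long usage(struct group *groups, int n, struct group *g)
{
	unsigned long pages;

	if (!g->parent)    /* root: no counter is maintained, sum the stats */
		pages = tree_stat(groups, n, g, pick_cache) +
			tree_stat(groups, n, g, pick_rss);
	else
		pages = g->counter;

	return (unsigned long long)pages << PAGE_SHIFT;
}

int main(void)
{
	struct group tree[3] = {
		{ .cache = 10, .rss = 5 },                        /* root */
		{ .cache = 2,  .rss = 8, .counter = 10 },
		{ .cache = 1,  .rss = 1, .counter = 2 },
	};
	tree[1].parent = &tree[0];
	tree[2].parent = &tree[1];

	printf("root usage:  %llu bytes\n", usage(tree, 3, &tree[0]));
	printf("child usage: %llu bytes\n", usage(tree, 3, &tree[1]));
	return 0;
}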
3944 3956
3945 enum { 3957 enum {
3946 RES_USAGE, 3958 RES_USAGE,
3947 RES_LIMIT, 3959 RES_LIMIT,
3948 RES_MAX_USAGE, 3960 RES_MAX_USAGE,
3949 RES_FAILCNT, 3961 RES_FAILCNT,
3950 RES_SOFT_LIMIT, 3962 RES_SOFT_LIMIT,
3951 }; 3963 };
3952 3964
3953 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3965 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3954 struct cftype *cft) 3966 struct cftype *cft)
3955 { 3967 {
3956 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3968 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3957 struct page_counter *counter; 3969 struct page_counter *counter;
3958 3970
3959 switch (MEMFILE_TYPE(cft->private)) { 3971 switch (MEMFILE_TYPE(cft->private)) {
3960 case _MEM: 3972 case _MEM:
3961 counter = &memcg->memory; 3973 counter = &memcg->memory;
3962 break; 3974 break;
3963 case _MEMSWAP: 3975 case _MEMSWAP:
3964 counter = &memcg->memsw; 3976 counter = &memcg->memsw;
3965 break; 3977 break;
3966 case _KMEM: 3978 case _KMEM:
3967 counter = &memcg->kmem; 3979 counter = &memcg->kmem;
3968 break; 3980 break;
3969 default: 3981 default:
3970 BUG(); 3982 BUG();
3971 } 3983 }
3972 3984
3973 switch (MEMFILE_ATTR(cft->private)) { 3985 switch (MEMFILE_ATTR(cft->private)) {
3974 case RES_USAGE: 3986 case RES_USAGE:
3975 if (counter == &memcg->memory) 3987 if (counter == &memcg->memory)
3976 return mem_cgroup_usage(memcg, false); 3988 return mem_cgroup_usage(memcg, false);
3977 if (counter == &memcg->memsw) 3989 if (counter == &memcg->memsw)
3978 return mem_cgroup_usage(memcg, true); 3990 return mem_cgroup_usage(memcg, true);
3979 return (u64)page_counter_read(counter) * PAGE_SIZE; 3991 return (u64)page_counter_read(counter) * PAGE_SIZE;
3980 case RES_LIMIT: 3992 case RES_LIMIT:
3981 return (u64)counter->limit * PAGE_SIZE; 3993 return (u64)counter->limit * PAGE_SIZE;
3982 case RES_MAX_USAGE: 3994 case RES_MAX_USAGE:
3983 return (u64)counter->watermark * PAGE_SIZE; 3995 return (u64)counter->watermark * PAGE_SIZE;
3984 case RES_FAILCNT: 3996 case RES_FAILCNT:
3985 return counter->failcnt; 3997 return counter->failcnt;
3986 case RES_SOFT_LIMIT: 3998 case RES_SOFT_LIMIT:
3987 return (u64)memcg->soft_limit * PAGE_SIZE; 3999 return (u64)memcg->soft_limit * PAGE_SIZE;
3988 default: 4000 default:
3989 BUG(); 4001 BUG();
3990 } 4002 }
3991 } 4003 }
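mem_cgroup_read_u64() dispatches on two values packed into cft->private: which counter the file refers to (_MEM, _MEMSWAP, _KMEM) and which attribute of it (usage, limit, ...). A common way to pack such a pair into one integer is to keep the type in the high bits and the attribute in the low bits; the macros below mimic that idea and are illustrative only, since the MEMFILE_* definitions are not part of this hunk and the exact encoding may differ.

#include <stdio.h>

enum { _MEM, _MEMSWAP, _KMEM };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT, RES_SOFT_LIMIT };

#define MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
#define MEMFILE_TYPE(val)		(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)		((val) & 0xffff)

int main(void)
{
	int private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

	printf("packed: %#x -> type %d, attr %d\n",
	       private, MEMFILE_TYPE(private), MEMFILE_ATTR(private));
	return 0;
}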
3992 4004
3993 #ifdef CONFIG_MEMCG_KMEM 4005 #ifdef CONFIG_MEMCG_KMEM
3994 /* should be called with activate_kmem_mutex held */ 4006 /* should be called with activate_kmem_mutex held */
3995 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4007 static int __memcg_activate_kmem(struct mem_cgroup *memcg,
3996 unsigned long nr_pages) 4008 unsigned long nr_pages)
3997 { 4009 {
3998 int err = 0; 4010 int err = 0;
3999 int memcg_id; 4011 int memcg_id;
4000 4012
4001 if (memcg_kmem_is_active(memcg)) 4013 if (memcg_kmem_is_active(memcg))
4002 return 0; 4014 return 0;
4003 4015
4004 /* 4016 /*
4005 * We are going to allocate memory for data shared by all memory 4017 * We are going to allocate memory for data shared by all memory
4006 * cgroups so let's stop accounting here. 4018 * cgroups so let's stop accounting here.
4007 */ 4019 */
4008 memcg_stop_kmem_account(); 4020 memcg_stop_kmem_account();
4009 4021
4010 /* 4022 /*
4011 * For simplicity, we won't allow this to be disabled. It also can't 4023 * For simplicity, we won't allow this to be disabled. It also can't
4012 * be changed if the cgroup has children already, or if tasks had 4024 * be changed if the cgroup has children already, or if tasks had
4013 * already joined. 4025 * already joined.
4014 * 4026 *
4015 * If tasks join before we set the limit, a person looking at 4027 * If tasks join before we set the limit, a person looking at
4016 * kmem.usage_in_bytes will have no way to determine when it took 4028 * kmem.usage_in_bytes will have no way to determine when it took
4017 * place, which makes the value quite meaningless. 4029 * place, which makes the value quite meaningless.
4018 * 4030 *
4019 * After it first became limited, changes in the value of the limit are 4031 * After it first became limited, changes in the value of the limit are
4020 * of course permitted. 4032 * of course permitted.
4021 */ 4033 */
4022 mutex_lock(&memcg_create_mutex); 4034 mutex_lock(&memcg_create_mutex);
4023 if (cgroup_has_tasks(memcg->css.cgroup) || 4035 if (cgroup_has_tasks(memcg->css.cgroup) ||
4024 (memcg->use_hierarchy && memcg_has_children(memcg))) 4036 (memcg->use_hierarchy && memcg_has_children(memcg)))
4025 err = -EBUSY; 4037 err = -EBUSY;
4026 mutex_unlock(&memcg_create_mutex); 4038 mutex_unlock(&memcg_create_mutex);
4027 if (err) 4039 if (err)
4028 goto out; 4040 goto out;
4029 4041
4030 memcg_id = memcg_alloc_cache_id(); 4042 memcg_id = memcg_alloc_cache_id();
4031 if (memcg_id < 0) { 4043 if (memcg_id < 0) {
4032 err = memcg_id; 4044 err = memcg_id;
4033 goto out; 4045 goto out;
4034 } 4046 }
4035 4047
4036 memcg->kmemcg_id = memcg_id; 4048 memcg->kmemcg_id = memcg_id;
4037 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4049 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4038 4050
4039 /* 4051 /*
4040 * We couldn't have accounted to this cgroup, because it hasn't got the 4052 * We couldn't have accounted to this cgroup, because it hasn't got the
4041 * active bit set yet, so this should succeed. 4053 * active bit set yet, so this should succeed.
4042 */ 4054 */
4043 err = page_counter_limit(&memcg->kmem, nr_pages); 4055 err = page_counter_limit(&memcg->kmem, nr_pages);
4044 VM_BUG_ON(err); 4056 VM_BUG_ON(err);
4045 4057
4046 static_key_slow_inc(&memcg_kmem_enabled_key); 4058 static_key_slow_inc(&memcg_kmem_enabled_key);
4047 /* 4059 /*
4048 * Setting the active bit after enabling static branching will 4060 * Setting the active bit after enabling static branching will
4049 * guarantee no one starts accounting before all call sites are 4061 * guarantee no one starts accounting before all call sites are
4050 * patched. 4062 * patched.
4051 */ 4063 */
4052 memcg_kmem_set_active(memcg); 4064 memcg_kmem_set_active(memcg);
4053 out: 4065 out:
4054 memcg_resume_kmem_account(); 4066 memcg_resume_kmem_account();
4055 return err; 4067 return err;
4056 } 4068 }
4057 4069
4058 static int memcg_activate_kmem(struct mem_cgroup *memcg, 4070 static int memcg_activate_kmem(struct mem_cgroup *memcg,
4059 unsigned long nr_pages) 4071 unsigned long nr_pages)
4060 { 4072 {
4061 int ret; 4073 int ret;
4062 4074
4063 mutex_lock(&activate_kmem_mutex); 4075 mutex_lock(&activate_kmem_mutex);
4064 ret = __memcg_activate_kmem(memcg, nr_pages); 4076 ret = __memcg_activate_kmem(memcg, nr_pages);
4065 mutex_unlock(&activate_kmem_mutex); 4077 mutex_unlock(&activate_kmem_mutex);
4066 return ret; 4078 return ret;
4067 } 4079 }
4068 4080
4069 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4081 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4070 unsigned long limit) 4082 unsigned long limit)
4071 { 4083 {
4072 int ret; 4084 int ret;
4073 4085
4074 mutex_lock(&memcg_limit_mutex); 4086 mutex_lock(&memcg_limit_mutex);
4075 if (!memcg_kmem_is_active(memcg)) 4087 if (!memcg_kmem_is_active(memcg))
4076 ret = memcg_activate_kmem(memcg, limit); 4088 ret = memcg_activate_kmem(memcg, limit);
4077 else 4089 else
4078 ret = page_counter_limit(&memcg->kmem, limit); 4090 ret = page_counter_limit(&memcg->kmem, limit);
4079 mutex_unlock(&memcg_limit_mutex); 4091 mutex_unlock(&memcg_limit_mutex);
4080 return ret; 4092 return ret;
4081 } 4093 }
4082 4094
4083 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 4095 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4084 { 4096 {
4085 int ret = 0; 4097 int ret = 0;
4086 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4098 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4087 4099
4088 if (!parent) 4100 if (!parent)
4089 return 0; 4101 return 0;
4090 4102
4091 mutex_lock(&activate_kmem_mutex); 4103 mutex_lock(&activate_kmem_mutex);
4092 /* 4104 /*
4093 * If the parent cgroup is not kmem-active now, it cannot be activated 4105 * If the parent cgroup is not kmem-active now, it cannot be activated
4094 * after this point, because it has at least one child already. 4106 * after this point, because it has at least one child already.
4095 */ 4107 */
4096 if (memcg_kmem_is_active(parent)) 4108 if (memcg_kmem_is_active(parent))
4097 ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 4109 ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4098 mutex_unlock(&activate_kmem_mutex); 4110 mutex_unlock(&activate_kmem_mutex);
4099 return ret; 4111 return ret;
4100 } 4112 }
4101 #else 4113 #else
4102 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4114 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4103 unsigned long limit) 4115 unsigned long limit)
4104 { 4116 {
4105 return -EINVAL; 4117 return -EINVAL;
4106 } 4118 }
4107 #endif /* CONFIG_MEMCG_KMEM */ 4119 #endif /* CONFIG_MEMCG_KMEM */
4108 4120
4109 /* 4121 /*
4110 * The user of this function is... 4122 * The user of this function is...
4111 * RES_LIMIT. 4123 * RES_LIMIT.
4112 */ 4124 */
4113 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 4125 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4114 char *buf, size_t nbytes, loff_t off) 4126 char *buf, size_t nbytes, loff_t off)
4115 { 4127 {
4116 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4128 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4117 unsigned long nr_pages; 4129 unsigned long nr_pages;
4118 int ret; 4130 int ret;
4119 4131
4120 buf = strstrip(buf); 4132 buf = strstrip(buf);
4121 ret = page_counter_memparse(buf, &nr_pages); 4133 ret = page_counter_memparse(buf, &nr_pages);
4122 if (ret) 4134 if (ret)
4123 return ret; 4135 return ret;
4124 4136
4125 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4137 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4126 case RES_LIMIT: 4138 case RES_LIMIT:
4127 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4139 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4128 ret = -EINVAL; 4140 ret = -EINVAL;
4129 break; 4141 break;
4130 } 4142 }
4131 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4143 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4132 case _MEM: 4144 case _MEM:
4133 ret = mem_cgroup_resize_limit(memcg, nr_pages); 4145 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4134 break; 4146 break;
4135 case _MEMSWAP: 4147 case _MEMSWAP:
4136 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 4148 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4137 break; 4149 break;
4138 case _KMEM: 4150 case _KMEM:
4139 ret = memcg_update_kmem_limit(memcg, nr_pages); 4151 ret = memcg_update_kmem_limit(memcg, nr_pages);
4140 break; 4152 break;
4141 } 4153 }
4142 break; 4154 break;
4143 case RES_SOFT_LIMIT: 4155 case RES_SOFT_LIMIT:
4144 memcg->soft_limit = nr_pages; 4156 memcg->soft_limit = nr_pages;
4145 ret = 0; 4157 ret = 0;
4146 break; 4158 break;
4147 } 4159 }
4148 return ret ?: nbytes; 4160 return ret ?: nbytes;
4149 } 4161 }
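The write handler strips the input, converts the human-readable size into a number of pages, and then dispatches on which file was written. The sketch below illustrates only the parse step, assuming the usual convention that "-1" means "no limit" and that K/M/G suffixes scale the value; parse_limit() is an invented helper, not page_counter_memparse().

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define LIMIT_MAX ULONG_MAX

/* Parse "8G", "524288", or "-1" into a page count; returns 0 on success. */
static int parse_limit(const char *buf, unsigned long *nr_pages)
{
	char *end;
	unsigned long long bytes;

	if (!strcmp(buf, "-1")) {          /* "-1" conventionally means unlimited */
		*nr_pages = LIMIT_MAX;
		return 0;
	}

	bytes = strtoull(buf, &end, 10);
	switch (*end) {
	case 'G': case 'g': bytes <<= 10; /* fall through */
	case 'M': case 'm': bytes <<= 10; /* fall through */
	case 'K': case 'k': bytes <<= 10; end++; break;
	case '\0': break;
	default: return -1;                /* trailing junk */
	}
	if (*end != '\0')
		return -1;

	*nr_pages = (unsigned long)(bytes / PAGE_SIZE);
	return 0;
}

int main(void)
{
	const char *inputs[] = { "8G", "524288", "-1", "bogus" };

	for (int i = 0; i < 4; i++) {
		unsigned long pages;

		if (parse_limit(inputs[i], &pages))
			printf("%-8s -> invalid\n", inputs[i]);
		else
			printf("%-8s -> %lu pages\n", inputs[i], pages);
	}
	return 0;
}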
4150 4162
4151 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4163 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4152 size_t nbytes, loff_t off) 4164 size_t nbytes, loff_t off)
4153 { 4165 {
4154 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4166 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4155 struct page_counter *counter; 4167 struct page_counter *counter;
4156 4168
4157 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4169 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4158 case _MEM: 4170 case _MEM:
4159 counter = &memcg->memory; 4171 counter = &memcg->memory;
4160 break; 4172 break;
4161 case _MEMSWAP: 4173 case _MEMSWAP:
4162 counter = &memcg->memsw; 4174 counter = &memcg->memsw;
4163 break; 4175 break;
4164 case _KMEM: 4176 case _KMEM:
4165 counter = &memcg->kmem; 4177 counter = &memcg->kmem;
4166 break; 4178 break;
4167 default: 4179 default:
4168 BUG(); 4180 BUG();
4169 } 4181 }
4170 4182
4171 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4183 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4172 case RES_MAX_USAGE: 4184 case RES_MAX_USAGE:
4173 page_counter_reset_watermark(counter); 4185 page_counter_reset_watermark(counter);
4174 break; 4186 break;
4175 case RES_FAILCNT: 4187 case RES_FAILCNT:
4176 counter->failcnt = 0; 4188 counter->failcnt = 0;
4177 break; 4189 break;
4178 default: 4190 default:
4179 BUG(); 4191 BUG();
4180 } 4192 }
4181 4193
4182 return nbytes; 4194 return nbytes;
4183 } 4195 }
4184 4196
4185 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 4197 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
4186 struct cftype *cft) 4198 struct cftype *cft)
4187 { 4199 {
4188 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 4200 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
4189 } 4201 }
4190 4202
4191 #ifdef CONFIG_MMU 4203 #ifdef CONFIG_MMU
4192 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4204 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4193 struct cftype *cft, u64 val) 4205 struct cftype *cft, u64 val)
4194 { 4206 {
4195 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4207 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4196 4208
4197 if (val >= (1 << NR_MOVE_TYPE)) 4209 if (val >= (1 << NR_MOVE_TYPE))
4198 return -EINVAL; 4210 return -EINVAL;
4199 4211
4200 /* 4212 /*
4201 * No kind of locking is needed in here, because ->can_attach() will 4213 * No kind of locking is needed in here, because ->can_attach() will
4202 * check this value once in the beginning of the process, and then carry 4214 * check this value once in the beginning of the process, and then carry
4203 * on with stale data. This means that changes to this value will only 4215 * on with stale data. This means that changes to this value will only
4204 * affect task migrations starting after the change. 4216 * affect task migrations starting after the change.
4205 */ 4217 */
4206 memcg->move_charge_at_immigrate = val; 4218 memcg->move_charge_at_immigrate = val;
4207 return 0; 4219 return 0;
4208 } 4220 }
4209 #else 4221 #else
4210 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4222 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4211 struct cftype *cft, u64 val) 4223 struct cftype *cft, u64 val)
4212 { 4224 {
4213 return -ENOSYS; 4225 return -ENOSYS;
4214 } 4226 }
4215 #endif 4227 #endif
4216 4228
4217 #ifdef CONFIG_NUMA 4229 #ifdef CONFIG_NUMA
4218 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4230 static int memcg_numa_stat_show(struct seq_file *m, void *v)
4219 { 4231 {
4220 struct numa_stat { 4232 struct numa_stat {
4221 const char *name; 4233 const char *name;
4222 unsigned int lru_mask; 4234 unsigned int lru_mask;
4223 }; 4235 };
4224 4236
4225 static const struct numa_stat stats[] = { 4237 static const struct numa_stat stats[] = {
4226 { "total", LRU_ALL }, 4238 { "total", LRU_ALL },
4227 { "file", LRU_ALL_FILE }, 4239 { "file", LRU_ALL_FILE },
4228 { "anon", LRU_ALL_ANON }, 4240 { "anon", LRU_ALL_ANON },
4229 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4241 { "unevictable", BIT(LRU_UNEVICTABLE) },
4230 }; 4242 };
4231 const struct numa_stat *stat; 4243 const struct numa_stat *stat;
4232 int nid; 4244 int nid;
4233 unsigned long nr; 4245 unsigned long nr;
4234 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4235 4247
4236 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4248 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4237 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 4249 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
4238 seq_printf(m, "%s=%lu", stat->name, nr); 4250 seq_printf(m, "%s=%lu", stat->name, nr);
4239 for_each_node_state(nid, N_MEMORY) { 4251 for_each_node_state(nid, N_MEMORY) {
4240 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4252 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4241 stat->lru_mask); 4253 stat->lru_mask);
4242 seq_printf(m, " N%d=%lu", nid, nr); 4254 seq_printf(m, " N%d=%lu", nid, nr);
4243 } 4255 }
4244 seq_putc(m, '\n'); 4256 seq_putc(m, '\n');
4245 } 4257 }
4246 4258
4247 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4259 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4248 struct mem_cgroup *iter; 4260 struct mem_cgroup *iter;
4249 4261
4250 nr = 0; 4262 nr = 0;
4251 for_each_mem_cgroup_tree(iter, memcg) 4263 for_each_mem_cgroup_tree(iter, memcg)
4252 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 4264 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
4253 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 4265 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
4254 for_each_node_state(nid, N_MEMORY) { 4266 for_each_node_state(nid, N_MEMORY) {
4255 nr = 0; 4267 nr = 0;
4256 for_each_mem_cgroup_tree(iter, memcg) 4268 for_each_mem_cgroup_tree(iter, memcg)
4257 nr += mem_cgroup_node_nr_lru_pages( 4269 nr += mem_cgroup_node_nr_lru_pages(
4258 iter, nid, stat->lru_mask); 4270 iter, nid, stat->lru_mask);
4259 seq_printf(m, " N%d=%lu", nid, nr); 4271 seq_printf(m, " N%d=%lu", nid, nr);
4260 } 4272 }
4261 seq_putc(m, '\n'); 4273 seq_putc(m, '\n');
4262 } 4274 }
4263 4275
4264 return 0; 4276 return 0;
4265 } 4277 }
4266 #endif /* CONFIG_NUMA */ 4278 #endif /* CONFIG_NUMA */
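memory.numa_stat emits one line per statistic in the form "name=<total> N0=<node0> N1=<node1> ...". The trivial sketch below reproduces that line format from a per-node array; the node counts are made up, and the total is computed by summing the nodes rather than via an LRU mask as the kernel does.

#include <stdio.h>

#define NR_NODES 2

static void print_numa_line(const char *name, const unsigned long per_node[NR_NODES])
{
	unsigned long total = 0;

	for (int nid = 0; nid < NR_NODES; nid++)
		total += per_node[nid];

	printf("%s=%lu", name, total);
	for (int nid = 0; nid < NR_NODES; nid++)
		printf(" N%d=%lu", nid, per_node[nid]);
	putchar('\n');
}

int main(void)
{
	const unsigned long file[NR_NODES] = { 1024, 256 };
	const unsigned long anon[NR_NODES] = { 300, 12 };

	print_numa_line("file", file);
	print_numa_line("anon", anon);
	return 0;
}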
4267 4279
4268 static inline void mem_cgroup_lru_names_not_uptodate(void) 4280 static inline void mem_cgroup_lru_names_not_uptodate(void)
4269 { 4281 {
4270 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4282 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4271 } 4283 }
4272 4284
4273 static int memcg_stat_show(struct seq_file *m, void *v) 4285 static int memcg_stat_show(struct seq_file *m, void *v)
4274 { 4286 {
4275 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4287 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4276 unsigned long memory, memsw; 4288 unsigned long memory, memsw;
4277 struct mem_cgroup *mi; 4289 struct mem_cgroup *mi;
4278 unsigned int i; 4290 unsigned int i;
4279 4291
4280 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4292 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4281 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4293 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4282 continue; 4294 continue;
4283 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4295 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4284 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4296 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4285 } 4297 }
4286 4298
4287 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 4299 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4288 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 4300 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4289 mem_cgroup_read_events(memcg, i)); 4301 mem_cgroup_read_events(memcg, i));
4290 4302
4291 for (i = 0; i < NR_LRU_LISTS; i++) 4303 for (i = 0; i < NR_LRU_LISTS; i++)
4292 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 4304 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4293 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4305 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4294 4306
4295 /* Hierarchical information */ 4307 /* Hierarchical information */
4296 memory = memsw = PAGE_COUNTER_MAX; 4308 memory = memsw = PAGE_COUNTER_MAX;
4297 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4309 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4298 memory = min(memory, mi->memory.limit); 4310 memory = min(memory, mi->memory.limit);
4299 memsw = min(memsw, mi->memsw.limit); 4311 memsw = min(memsw, mi->memsw.limit);
4300 } 4312 }
4301 seq_printf(m, "hierarchical_memory_limit %llu\n", 4313 seq_printf(m, "hierarchical_memory_limit %llu\n",
4302 (u64)memory * PAGE_SIZE); 4314 (u64)memory * PAGE_SIZE);
4303 if (do_swap_account) 4315 if (do_swap_account)
4304 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4316 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4305 (u64)memsw * PAGE_SIZE); 4317 (u64)memsw * PAGE_SIZE);
4306 4318
4307 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4319 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4308 long long val = 0; 4320 long long val = 0;
4309 4321
4310 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4322 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4311 continue; 4323 continue;
4312 for_each_mem_cgroup_tree(mi, memcg) 4324 for_each_mem_cgroup_tree(mi, memcg)
4313 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4325 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4314 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 4326 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4315 } 4327 }
4316 4328
4317 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 4329 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4318 unsigned long long val = 0; 4330 unsigned long long val = 0;
4319 4331
4320 for_each_mem_cgroup_tree(mi, memcg) 4332 for_each_mem_cgroup_tree(mi, memcg)
4321 val += mem_cgroup_read_events(mi, i); 4333 val += mem_cgroup_read_events(mi, i);
4322 seq_printf(m, "total_%s %llu\n", 4334 seq_printf(m, "total_%s %llu\n",
4323 mem_cgroup_events_names[i], val); 4335 mem_cgroup_events_names[i], val);
4324 } 4336 }
4325 4337
4326 for (i = 0; i < NR_LRU_LISTS; i++) { 4338 for (i = 0; i < NR_LRU_LISTS; i++) {
4327 unsigned long long val = 0; 4339 unsigned long long val = 0;
4328 4340
4329 for_each_mem_cgroup_tree(mi, memcg) 4341 for_each_mem_cgroup_tree(mi, memcg)
4330 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 4342 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4331 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 4343 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4332 } 4344 }
4333 4345
4334 #ifdef CONFIG_DEBUG_VM 4346 #ifdef CONFIG_DEBUG_VM
4335 { 4347 {
4336 int nid, zid; 4348 int nid, zid;
4337 struct mem_cgroup_per_zone *mz; 4349 struct mem_cgroup_per_zone *mz;
4338 struct zone_reclaim_stat *rstat; 4350 struct zone_reclaim_stat *rstat;
4339 unsigned long recent_rotated[2] = {0, 0}; 4351 unsigned long recent_rotated[2] = {0, 0};
4340 unsigned long recent_scanned[2] = {0, 0}; 4352 unsigned long recent_scanned[2] = {0, 0};
4341 4353
4342 for_each_online_node(nid) 4354 for_each_online_node(nid)
4343 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4355 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4344 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 4356 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
4345 rstat = &mz->lruvec.reclaim_stat; 4357 rstat = &mz->lruvec.reclaim_stat;
4346 4358
4347 recent_rotated[0] += rstat->recent_rotated[0]; 4359 recent_rotated[0] += rstat->recent_rotated[0];
4348 recent_rotated[1] += rstat->recent_rotated[1]; 4360 recent_rotated[1] += rstat->recent_rotated[1];
4349 recent_scanned[0] += rstat->recent_scanned[0]; 4361 recent_scanned[0] += rstat->recent_scanned[0];
4350 recent_scanned[1] += rstat->recent_scanned[1]; 4362 recent_scanned[1] += rstat->recent_scanned[1];
4351 } 4363 }
4352 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 4364 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4353 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 4365 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4354 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 4366 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4355 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 4367 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4356 } 4368 }
4357 #endif 4369 #endif
4358 4370
4359 return 0; 4371 return 0;
4360 } 4372 }
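The hierarchical_memory_limit value printed above is simply the minimum limit along the ancestor chain walked via parent_mem_cgroup(). A small standalone model of that walk, with simplified types and ULONG_MAX standing in for PAGE_COUNTER_MAX (illustrative only):

#include <limits.h>
#include <stdio.h>

struct group {
	unsigned long limit;		/* limit in pages; ULONG_MAX ~ "unlimited" */
	const struct group *parent;
};

/* Effective limit = minimum limit over the group and all of its ancestors. */
static unsigned long effective_limit(const struct group *g)
{
	unsigned long min = ULONG_MAX;

	for (; g; g = g->parent)
		if (g->limit < min)
			min = g->limit;
	return min;
}

int main(void)
{
	struct group root   = { ULONG_MAX, NULL };
	struct group parent = { 262144, &root };	/* 1G with 4K pages */
	struct group child  = { 524288, &parent };	/* 2G, higher than the parent */

	/* The parent's tighter limit wins: prints 262144. */
	printf("%lu\n", effective_limit(&child));
	return 0;
}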
4361 4373
4362 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4374 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4363 struct cftype *cft) 4375 struct cftype *cft)
4364 { 4376 {
4365 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4377 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4366 4378
4367 return mem_cgroup_swappiness(memcg); 4379 return mem_cgroup_swappiness(memcg);
4368 } 4380 }
4369 4381
4370 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4382 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4371 struct cftype *cft, u64 val) 4383 struct cftype *cft, u64 val)
4372 { 4384 {
4373 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4385 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4374 4386
4375 if (val > 100) 4387 if (val > 100)
4376 return -EINVAL; 4388 return -EINVAL;
4377 4389
4378 if (css->parent) 4390 if (css->parent)
4379 memcg->swappiness = val; 4391 memcg->swappiness = val;
4380 else 4392 else
4381 vm_swappiness = val; 4393 vm_swappiness = val;
4382 4394
4383 return 0; 4395 return 0;
4384 } 4396 }
4385 4397
4386 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4398 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4387 { 4399 {
4388 struct mem_cgroup_threshold_ary *t; 4400 struct mem_cgroup_threshold_ary *t;
4389 unsigned long usage; 4401 unsigned long usage;
4390 int i; 4402 int i;
4391 4403
4392 rcu_read_lock(); 4404 rcu_read_lock();
4393 if (!swap) 4405 if (!swap)
4394 t = rcu_dereference(memcg->thresholds.primary); 4406 t = rcu_dereference(memcg->thresholds.primary);
4395 else 4407 else
4396 t = rcu_dereference(memcg->memsw_thresholds.primary); 4408 t = rcu_dereference(memcg->memsw_thresholds.primary);
4397 4409
4398 if (!t) 4410 if (!t)
4399 goto unlock; 4411 goto unlock;
4400 4412
4401 usage = mem_cgroup_usage(memcg, swap); 4413 usage = mem_cgroup_usage(memcg, swap);
4402 4414
4403 /* 4415 /*
4404 * current_threshold points to the threshold just below or equal to usage. 4416 * current_threshold points to the threshold just below or equal to usage.
4405 * If it's not true, a threshold was crossed after the last 4417 * If it's not true, a threshold was crossed after the last
4406 * call of __mem_cgroup_threshold(). 4418 * call of __mem_cgroup_threshold().
4407 */ 4419 */
4408 i = t->current_threshold; 4420 i = t->current_threshold;
4409 4421
4410 /* 4422 /*
4411 * Iterate backward over array of thresholds starting from 4423 * Iterate backward over array of thresholds starting from
4412 * current_threshold and check if a threshold is crossed. 4424 * current_threshold and check if a threshold is crossed.
4413 * If none of the thresholds below usage is crossed, we read 4425 * If none of the thresholds below usage is crossed, we read
4414 * only one element of the array here. 4426 * only one element of the array here.
4415 */ 4427 */
4416 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4428 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4417 eventfd_signal(t->entries[i].eventfd, 1); 4429 eventfd_signal(t->entries[i].eventfd, 1);
4418 4430
4419 /* i = current_threshold + 1 */ 4431 /* i = current_threshold + 1 */
4420 i++; 4432 i++;
4421 4433
4422 /* 4434 /*
4423 * Iterate forward over array of thresholds starting from 4435 * Iterate forward over array of thresholds starting from
4424 * current_threshold+1 and check if a threshold is crossed. 4436 * current_threshold+1 and check if a threshold is crossed.
4425 * If none of the thresholds above usage is crossed, we read 4437 * If none of the thresholds above usage is crossed, we read
4426 * only one element of the array here. 4438 * only one element of the array here.
4427 */ 4439 */
4428 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4440 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4429 eventfd_signal(t->entries[i].eventfd, 1); 4441 eventfd_signal(t->entries[i].eventfd, 1);
4430 4442
4431 /* Update current_threshold */ 4443 /* Update current_threshold */
4432 t->current_threshold = i - 1; 4444 t->current_threshold = i - 1;
4433 unlock: 4445 unlock:
4434 rcu_read_unlock(); 4446 rcu_read_unlock();
4435 } 4447 }
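To make the two scans above easier to follow, here is a self-contained userspace model of the same walk, using a plain sorted array instead of the RCU-protected mem_cgroup_threshold_ary (illustrative only):

/* Simplified model of __mem_cgroup_threshold(): thresholds[] is sorted
 * ascending, *cur points at the highest entry <= the previous usage.
 * Every entry crossed by the new usage is "signalled". */
#include <stdio.h>

static void scan_thresholds(const unsigned long *thresholds, int size,
			    int *cur, unsigned long usage)
{
	int i = *cur;

	/* Walk backward: thresholds the usage has fallen below. */
	for (; i >= 0 && thresholds[i] > usage; i--)
		printf("signal %lu (fell below)\n", thresholds[i]);

	/* Walk forward from the next entry: newly exceeded thresholds. */
	for (i++; i < size && thresholds[i] <= usage; i++)
		printf("signal %lu (crossed)\n", thresholds[i]);

	*cur = i - 1;	/* highest threshold <= usage, or -1 */
}

int main(void)
{
	unsigned long t[] = { 100, 200, 400, 800 };
	int cur = 1;	/* previous usage was between 200 and 400 */

	scan_thresholds(t, 4, &cur, 500);	/* signals 400, leaves cur = 2 */
	return 0;
}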
4436 4448
4437 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4449 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4438 { 4450 {
4439 while (memcg) { 4451 while (memcg) {
4440 __mem_cgroup_threshold(memcg, false); 4452 __mem_cgroup_threshold(memcg, false);
4441 if (do_swap_account) 4453 if (do_swap_account)
4442 __mem_cgroup_threshold(memcg, true); 4454 __mem_cgroup_threshold(memcg, true);
4443 4455
4444 memcg = parent_mem_cgroup(memcg); 4456 memcg = parent_mem_cgroup(memcg);
4445 } 4457 }
4446 } 4458 }
4447 4459
4448 static int compare_thresholds(const void *a, const void *b) 4460 static int compare_thresholds(const void *a, const void *b)
4449 { 4461 {
4450 const struct mem_cgroup_threshold *_a = a; 4462 const struct mem_cgroup_threshold *_a = a;
4451 const struct mem_cgroup_threshold *_b = b; 4463 const struct mem_cgroup_threshold *_b = b;
4452 4464
4453 if (_a->threshold > _b->threshold) 4465 if (_a->threshold > _b->threshold)
4454 return 1; 4466 return 1;
4455 4467
4456 if (_a->threshold < _b->threshold) 4468 if (_a->threshold < _b->threshold)
4457 return -1; 4469 return -1;
4458 4470
4459 return 0; 4471 return 0;
4460 } 4472 }
4461 4473
4462 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4474 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4463 { 4475 {
4464 struct mem_cgroup_eventfd_list *ev; 4476 struct mem_cgroup_eventfd_list *ev;
4465 4477
4466 spin_lock(&memcg_oom_lock); 4478 spin_lock(&memcg_oom_lock);
4467 4479
4468 list_for_each_entry(ev, &memcg->oom_notify, list) 4480 list_for_each_entry(ev, &memcg->oom_notify, list)
4469 eventfd_signal(ev->eventfd, 1); 4481 eventfd_signal(ev->eventfd, 1);
4470 4482
4471 spin_unlock(&memcg_oom_lock); 4483 spin_unlock(&memcg_oom_lock);
4472 return 0; 4484 return 0;
4473 } 4485 }
4474 4486
4475 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4487 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4476 { 4488 {
4477 struct mem_cgroup *iter; 4489 struct mem_cgroup *iter;
4478 4490
4479 for_each_mem_cgroup_tree(iter, memcg) 4491 for_each_mem_cgroup_tree(iter, memcg)
4480 mem_cgroup_oom_notify_cb(iter); 4492 mem_cgroup_oom_notify_cb(iter);
4481 } 4493 }
4482 4494
4483 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4495 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4484 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4496 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4485 { 4497 {
4486 struct mem_cgroup_thresholds *thresholds; 4498 struct mem_cgroup_thresholds *thresholds;
4487 struct mem_cgroup_threshold_ary *new; 4499 struct mem_cgroup_threshold_ary *new;
4488 unsigned long threshold; 4500 unsigned long threshold;
4489 unsigned long usage; 4501 unsigned long usage;
4490 int i, size, ret; 4502 int i, size, ret;
4491 4503
4492 ret = page_counter_memparse(args, &threshold); 4504 ret = page_counter_memparse(args, &threshold);
4493 if (ret) 4505 if (ret)
4494 return ret; 4506 return ret;
4495 4507
4496 mutex_lock(&memcg->thresholds_lock); 4508 mutex_lock(&memcg->thresholds_lock);
4497 4509
4498 if (type == _MEM) { 4510 if (type == _MEM) {
4499 thresholds = &memcg->thresholds; 4511 thresholds = &memcg->thresholds;
4500 usage = mem_cgroup_usage(memcg, false); 4512 usage = mem_cgroup_usage(memcg, false);
4501 } else if (type == _MEMSWAP) { 4513 } else if (type == _MEMSWAP) {
4502 thresholds = &memcg->memsw_thresholds; 4514 thresholds = &memcg->memsw_thresholds;
4503 usage = mem_cgroup_usage(memcg, true); 4515 usage = mem_cgroup_usage(memcg, true);
4504 } else 4516 } else
4505 BUG(); 4517 BUG();
4506 4518
4507 /* Check if a threshold was crossed before adding a new one */ 4519 /* Check if a threshold was crossed before adding a new one */
4508 if (thresholds->primary) 4520 if (thresholds->primary)
4509 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4521 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4510 4522
4511 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4523 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4512 4524
4513 /* Allocate memory for new array of thresholds */ 4525 /* Allocate memory for new array of thresholds */
4514 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4526 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4515 GFP_KERNEL); 4527 GFP_KERNEL);
4516 if (!new) { 4528 if (!new) {
4517 ret = -ENOMEM; 4529 ret = -ENOMEM;
4518 goto unlock; 4530 goto unlock;
4519 } 4531 }
4520 new->size = size; 4532 new->size = size;
4521 4533
4522 /* Copy thresholds (if any) to new array */ 4534 /* Copy thresholds (if any) to new array */
4523 if (thresholds->primary) { 4535 if (thresholds->primary) {
4524 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4536 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4525 sizeof(struct mem_cgroup_threshold)); 4537 sizeof(struct mem_cgroup_threshold));
4526 } 4538 }
4527 4539
4528 /* Add new threshold */ 4540 /* Add new threshold */
4529 new->entries[size - 1].eventfd = eventfd; 4541 new->entries[size - 1].eventfd = eventfd;
4530 new->entries[size - 1].threshold = threshold; 4542 new->entries[size - 1].threshold = threshold;
4531 4543
4532 /* Sort thresholds. Registering a new threshold isn't time-critical */ 4544 /* Sort thresholds. Registering a new threshold isn't time-critical */
4533 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4545 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4534 compare_thresholds, NULL); 4546 compare_thresholds, NULL);
4535 4547
4536 /* Find current threshold */ 4548 /* Find current threshold */
4537 new->current_threshold = -1; 4549 new->current_threshold = -1;
4538 for (i = 0; i < size; i++) { 4550 for (i = 0; i < size; i++) {
4539 if (new->entries[i].threshold <= usage) { 4551 if (new->entries[i].threshold <= usage) {
4540 /* 4552 /*
4541 * new->current_threshold will not be used until 4553 * new->current_threshold will not be used until
4542 * rcu_assign_pointer(), so it's safe to increment 4554 * rcu_assign_pointer(), so it's safe to increment
4543 * it here. 4555 * it here.
4544 */ 4556 */
4545 ++new->current_threshold; 4557 ++new->current_threshold;
4546 } else 4558 } else
4547 break; 4559 break;
4548 } 4560 }
4549 4561
4550 /* Free old spare buffer and save old primary buffer as spare */ 4562 /* Free old spare buffer and save old primary buffer as spare */
4551 kfree(thresholds->spare); 4563 kfree(thresholds->spare);
4552 thresholds->spare = thresholds->primary; 4564 thresholds->spare = thresholds->primary;
4553 4565
4554 rcu_assign_pointer(thresholds->primary, new); 4566 rcu_assign_pointer(thresholds->primary, new);
4555 4567
4556 /* To be sure that nobody still uses the old thresholds array */ 4568 /* To be sure that nobody still uses the old thresholds array */
4557 synchronize_rcu(); 4569 synchronize_rcu();
4558 4570
4559 unlock: 4571 unlock:
4560 mutex_unlock(&memcg->thresholds_lock); 4572 mutex_unlock(&memcg->thresholds_lock);
4561 4573
4562 return ret; 4574 return ret;
4563 } 4575 }
4564 4576
4565 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4577 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4566 struct eventfd_ctx *eventfd, const char *args) 4578 struct eventfd_ctx *eventfd, const char *args)
4567 { 4579 {
4568 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4580 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4569 } 4581 }
4570 4582
4571 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4583 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4572 struct eventfd_ctx *eventfd, const char *args) 4584 struct eventfd_ctx *eventfd, const char *args)
4573 { 4585 {
4574 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4586 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4575 } 4587 }
4576 4588
4577 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4589 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4578 struct eventfd_ctx *eventfd, enum res_type type) 4590 struct eventfd_ctx *eventfd, enum res_type type)
4579 { 4591 {
4580 struct mem_cgroup_thresholds *thresholds; 4592 struct mem_cgroup_thresholds *thresholds;
4581 struct mem_cgroup_threshold_ary *new; 4593 struct mem_cgroup_threshold_ary *new;
4582 unsigned long usage; 4594 unsigned long usage;
4583 int i, j, size; 4595 int i, j, size;
4584 4596
4585 mutex_lock(&memcg->thresholds_lock); 4597 mutex_lock(&memcg->thresholds_lock);
4586 4598
4587 if (type == _MEM) { 4599 if (type == _MEM) {
4588 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4589 usage = mem_cgroup_usage(memcg, false); 4601 usage = mem_cgroup_usage(memcg, false);
4590 } else if (type == _MEMSWAP) { 4602 } else if (type == _MEMSWAP) {
4591 thresholds = &memcg->memsw_thresholds; 4603 thresholds = &memcg->memsw_thresholds;
4592 usage = mem_cgroup_usage(memcg, true); 4604 usage = mem_cgroup_usage(memcg, true);
4593 } else 4605 } else
4594 BUG(); 4606 BUG();
4595 4607
4596 if (!thresholds->primary) 4608 if (!thresholds->primary)
4597 goto unlock; 4609 goto unlock;
4598 4610
4599 /* Check if a threshold was crossed before removing */ 4611 /* Check if a threshold was crossed before removing */
4600 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4612 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4601 4613
4602 /* Calculate the new number of thresholds */ 4614 /* Calculate the new number of thresholds */
4603 size = 0; 4615 size = 0;
4604 for (i = 0; i < thresholds->primary->size; i++) { 4616 for (i = 0; i < thresholds->primary->size; i++) {
4605 if (thresholds->primary->entries[i].eventfd != eventfd) 4617 if (thresholds->primary->entries[i].eventfd != eventfd)
4606 size++; 4618 size++;
4607 } 4619 }
4608 4620
4609 new = thresholds->spare; 4621 new = thresholds->spare;
4610 4622
4611 /* Set thresholds array to NULL if we don't have thresholds */ 4623 /* Set thresholds array to NULL if we don't have thresholds */
4612 if (!size) { 4624 if (!size) {
4613 kfree(new); 4625 kfree(new);
4614 new = NULL; 4626 new = NULL;
4615 goto swap_buffers; 4627 goto swap_buffers;
4616 } 4628 }
4617 4629
4618 new->size = size; 4630 new->size = size;
4619 4631
4620 /* Copy thresholds and find current threshold */ 4632 /* Copy thresholds and find current threshold */
4621 new->current_threshold = -1; 4633 new->current_threshold = -1;
4622 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4634 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4623 if (thresholds->primary->entries[i].eventfd == eventfd) 4635 if (thresholds->primary->entries[i].eventfd == eventfd)
4624 continue; 4636 continue;
4625 4637
4626 new->entries[j] = thresholds->primary->entries[i]; 4638 new->entries[j] = thresholds->primary->entries[i];
4627 if (new->entries[j].threshold <= usage) { 4639 if (new->entries[j].threshold <= usage) {
4628 /* 4640 /*
4629 * new->current_threshold will not be used 4641 * new->current_threshold will not be used
4630 * until rcu_assign_pointer(), so it's safe to increment 4642 * until rcu_assign_pointer(), so it's safe to increment
4631 * it here. 4643 * it here.
4632 */ 4644 */
4633 ++new->current_threshold; 4645 ++new->current_threshold;
4634 } 4646 }
4635 j++; 4647 j++;
4636 } 4648 }
4637 4649
4638 swap_buffers: 4650 swap_buffers:
4639 /* Swap primary and spare array */ 4651 /* Swap primary and spare array */
4640 thresholds->spare = thresholds->primary; 4652 thresholds->spare = thresholds->primary;
4641 /* If all events are unregistered, free the spare array */ 4653 /* If all events are unregistered, free the spare array */
4642 if (!new) { 4654 if (!new) {
4643 kfree(thresholds->spare); 4655 kfree(thresholds->spare);
4644 thresholds->spare = NULL; 4656 thresholds->spare = NULL;
4645 } 4657 }
4646 4658
4647 rcu_assign_pointer(thresholds->primary, new); 4659 rcu_assign_pointer(thresholds->primary, new);
4648 4660
4649 /* To be sure that nobody still uses the old thresholds array */ 4661 /* To be sure that nobody still uses the old thresholds array */
4650 synchronize_rcu(); 4662 synchronize_rcu();
4651 unlock: 4663 unlock:
4652 mutex_unlock(&memcg->thresholds_lock); 4664 mutex_unlock(&memcg->thresholds_lock);
4653 } 4665 }
4654 4666
4655 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4667 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4656 struct eventfd_ctx *eventfd) 4668 struct eventfd_ctx *eventfd)
4657 { 4669 {
4658 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4670 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4659 } 4671 }
4660 4672
4661 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4673 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4662 struct eventfd_ctx *eventfd) 4674 struct eventfd_ctx *eventfd)
4663 { 4675 {
4664 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4676 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4665 } 4677 }
4666 4678
4667 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4679 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4668 struct eventfd_ctx *eventfd, const char *args) 4680 struct eventfd_ctx *eventfd, const char *args)
4669 { 4681 {
4670 struct mem_cgroup_eventfd_list *event; 4682 struct mem_cgroup_eventfd_list *event;
4671 4683
4672 event = kmalloc(sizeof(*event), GFP_KERNEL); 4684 event = kmalloc(sizeof(*event), GFP_KERNEL);
4673 if (!event) 4685 if (!event)
4674 return -ENOMEM; 4686 return -ENOMEM;
4675 4687
4676 spin_lock(&memcg_oom_lock); 4688 spin_lock(&memcg_oom_lock);
4677 4689
4678 event->eventfd = eventfd; 4690 event->eventfd = eventfd;
4679 list_add(&event->list, &memcg->oom_notify); 4691 list_add(&event->list, &memcg->oom_notify);
4680 4692
4681 /* already in OOM ? */ 4693 /* already in OOM ? */
4682 if (atomic_read(&memcg->under_oom)) 4694 if (atomic_read(&memcg->under_oom))
4683 eventfd_signal(eventfd, 1); 4695 eventfd_signal(eventfd, 1);
4684 spin_unlock(&memcg_oom_lock); 4696 spin_unlock(&memcg_oom_lock);
4685 4697
4686 return 0; 4698 return 0;
4687 } 4699 }
4688 4700
4689 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4701 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4690 struct eventfd_ctx *eventfd) 4702 struct eventfd_ctx *eventfd)
4691 { 4703 {
4692 struct mem_cgroup_eventfd_list *ev, *tmp; 4704 struct mem_cgroup_eventfd_list *ev, *tmp;
4693 4705
4694 spin_lock(&memcg_oom_lock); 4706 spin_lock(&memcg_oom_lock);
4695 4707
4696 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4708 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4697 if (ev->eventfd == eventfd) { 4709 if (ev->eventfd == eventfd) {
4698 list_del(&ev->list); 4710 list_del(&ev->list);
4699 kfree(ev); 4711 kfree(ev);
4700 } 4712 }
4701 } 4713 }
4702 4714
4703 spin_unlock(&memcg_oom_lock); 4715 spin_unlock(&memcg_oom_lock);
4704 } 4716 }
4705 4717
4706 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4718 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4707 { 4719 {
4708 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4720 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4709 4721
4710 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4722 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4711 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4723 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
4712 return 0; 4724 return 0;
4713 } 4725 }
4714 4726
4715 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4727 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4716 struct cftype *cft, u64 val) 4728 struct cftype *cft, u64 val)
4717 { 4729 {
4718 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4730 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4719 4731
4720 /* cannot be set on the root cgroup and only 0 and 1 are allowed */ 4732 /* cannot be set on the root cgroup and only 0 and 1 are allowed */
4721 if (!css->parent || !((val == 0) || (val == 1))) 4733 if (!css->parent || !((val == 0) || (val == 1)))
4722 return -EINVAL; 4734 return -EINVAL;
4723 4735
4724 memcg->oom_kill_disable = val; 4736 memcg->oom_kill_disable = val;
4725 if (!val) 4737 if (!val)
4726 memcg_oom_recover(memcg); 4738 memcg_oom_recover(memcg);
4727 4739
4728 return 0; 4740 return 0;
4729 } 4741 }
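A hedged userspace sketch of driving the two handlers above: write 1 to memory.oom_control to disable the OOM killer for a group, then read the file back to see oom_kill_disable and under_oom. The cgroup path is an assumption for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.oom_control";
	char line[128];
	FILE *f;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {	/* only 0 and 1 are accepted */
		perror(path);
		return 1;
	}
	close(fd);

	f = fopen(path, "r");	/* prints oom_kill_disable and under_oom */
	while (f && fgets(line, sizeof(line), f))
		fputs(line, stdout);
	if (f)
		fclose(f);
	return 0;
}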
4730 4742
4731 #ifdef CONFIG_MEMCG_KMEM 4743 #ifdef CONFIG_MEMCG_KMEM
4732 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4744 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4733 { 4745 {
4734 int ret; 4746 int ret;
4735 4747
4736 memcg->kmemcg_id = -1; 4748 memcg->kmemcg_id = -1;
4737 ret = memcg_propagate_kmem(memcg); 4749 ret = memcg_propagate_kmem(memcg);
4738 if (ret) 4750 if (ret)
4739 return ret; 4751 return ret;
4740 4752
4741 return mem_cgroup_sockets_init(memcg, ss); 4753 return mem_cgroup_sockets_init(memcg, ss);
4742 } 4754 }
4743 4755
4744 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4756 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4745 { 4757 {
4746 mem_cgroup_sockets_destroy(memcg); 4758 mem_cgroup_sockets_destroy(memcg);
4747 } 4759 }
4748 4760
4749 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4761 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4750 { 4762 {
4751 if (!memcg_kmem_is_active(memcg)) 4763 if (!memcg_kmem_is_active(memcg))
4752 return; 4764 return;
4753 4765
4754 /* 4766 /*
4755 * kmem charges can outlive the cgroup. In the case of slab 4767 * kmem charges can outlive the cgroup. In the case of slab
4756 * pages, for instance, a page can contain objects from various 4768 * pages, for instance, a page can contain objects from various
4757 * processes. As we do not take a reference for every 4769 * processes. As we do not take a reference for every
4758 * such allocation, we have to be careful when doing uncharge 4770 * such allocation, we have to be careful when doing uncharge
4759 * (see memcg_uncharge_kmem) and here during offlining. 4771 * (see memcg_uncharge_kmem) and here during offlining.
4760 * 4772 *
4761 * The idea is that only the _last_ uncharge which sees 4773 * The idea is that only the _last_ uncharge which sees
4762 * the dead memcg will drop the last reference. An additional 4774 * the dead memcg will drop the last reference. An additional
4763 * reference is taken here before the group is marked dead 4775 * reference is taken here before the group is marked dead
4764 * which is then paired with css_put during uncharge resp. here. 4776 * which is then paired with css_put during uncharge resp. here.
4765 * 4777 *
4766 * Although this might sound strange as this path is called from 4778 * Although this might sound strange as this path is called from
4767 * css_offline() when the reference might have dropped down to 0 and 4779 * css_offline() when the reference might have dropped down to 0 and
4768 * shouldn't be incremented anymore (css_tryget_online() would 4780 * shouldn't be incremented anymore (css_tryget_online() would
4769 * fail), we do not have other options because of the kmem 4781 * fail), we do not have other options because of the kmem
4770 * allocations' lifetime. 4782 * allocations' lifetime.
4771 */ 4783 */
4772 css_get(&memcg->css); 4784 css_get(&memcg->css);
4773 4785
4774 memcg_kmem_mark_dead(memcg); 4786 memcg_kmem_mark_dead(memcg);
4775 4787
4776 if (page_counter_read(&memcg->kmem)) 4788 if (page_counter_read(&memcg->kmem))
4777 return; 4789 return;
4778 4790
4779 if (memcg_kmem_test_and_clear_dead(memcg)) 4791 if (memcg_kmem_test_and_clear_dead(memcg))
4780 css_put(&memcg->css); 4792 css_put(&memcg->css);
4781 } 4793 }
4782 #else 4794 #else
4783 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4795 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4784 { 4796 {
4785 return 0; 4797 return 0;
4786 } 4798 }
4787 4799
4788 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4800 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4789 { 4801 {
4790 } 4802 }
4791 4803
4792 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4804 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4793 { 4805 {
4794 } 4806 }
4795 #endif 4807 #endif
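For context, a hedged sketch of the uncharge side that the comment in kmem_cgroup_css_offline() refers to: the last kmem uncharge against a memcg already marked dead drops the extra css reference taken above. This is an illustrative simplification, not the actual function body; the real memcg_uncharge_kmem() also uncharges memcg->memory (and memcg->memsw with swap accounting).

/* Illustrative only: the css_put() pairing for the css_get() taken in
 * kmem_cgroup_css_offline(). */
static void kmem_uncharge_css_pairing(struct mem_cgroup *memcg,
				      unsigned long nr_pages)
{
	page_counter_uncharge(&memcg->kmem, nr_pages);

	/* Charges still outstanding: nothing to release yet. */
	if (page_counter_read(&memcg->kmem))
		return;

	/* Only the last uncharge against a dead memcg drops the
	 * keep-alive reference taken at offline time. */
	if (memcg_kmem_test_and_clear_dead(memcg))
		css_put(&memcg->css);
}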
4796 4808
4797 /* 4809 /*
4798 * DO NOT USE IN NEW FILES. 4810 * DO NOT USE IN NEW FILES.
4799 * 4811 *
4800 * "cgroup.event_control" implementation. 4812 * "cgroup.event_control" implementation.
4801 * 4813 *
4802 * This is way over-engineered. It tries to support fully configurable 4814 * This is way over-engineered. It tries to support fully configurable
4803 * events for each user. Such a level of flexibility is completely 4815 * events for each user. Such a level of flexibility is completely
4804 * unnecessary, especially in light of the planned unified hierarchy. 4816 * unnecessary, especially in light of the planned unified hierarchy.
4805 * 4817 *
4806 * Please deprecate this and replace with something simpler if at all 4818 * Please deprecate this and replace with something simpler if at all
4807 * possible. 4819 * possible.
4808 */ 4820 */
4809 4821
4810 /* 4822 /*
4811 * Unregister event and free resources. 4823 * Unregister event and free resources.
4812 * 4824 *
4813 * Gets called from workqueue. 4825 * Gets called from workqueue.
4814 */ 4826 */
4815 static void memcg_event_remove(struct work_struct *work) 4827 static void memcg_event_remove(struct work_struct *work)
4816 { 4828 {
4817 struct mem_cgroup_event *event = 4829 struct mem_cgroup_event *event =
4818 container_of(work, struct mem_cgroup_event, remove); 4830 container_of(work, struct mem_cgroup_event, remove);
4819 struct mem_cgroup *memcg = event->memcg; 4831 struct mem_cgroup *memcg = event->memcg;
4820 4832
4821 remove_wait_queue(event->wqh, &event->wait); 4833 remove_wait_queue(event->wqh, &event->wait);
4822 4834
4823 event->unregister_event(memcg, event->eventfd); 4835 event->unregister_event(memcg, event->eventfd);
4824 4836
4825 /* Notify userspace the event is going away. */ 4837 /* Notify userspace the event is going away. */
4826 eventfd_signal(event->eventfd, 1); 4838 eventfd_signal(event->eventfd, 1);
4827 4839
4828 eventfd_ctx_put(event->eventfd); 4840 eventfd_ctx_put(event->eventfd);
4829 kfree(event); 4841 kfree(event);
4830 css_put(&memcg->css); 4842 css_put(&memcg->css);
4831 } 4843 }
4832 4844
4833 /* 4845 /*
4834 * Gets called on POLLHUP on eventfd when user closes it. 4846 * Gets called on POLLHUP on eventfd when user closes it.
4835 * 4847 *
4836 * Called with wqh->lock held and interrupts disabled. 4848 * Called with wqh->lock held and interrupts disabled.
4837 */ 4849 */
4838 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4850 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
4839 int sync, void *key) 4851 int sync, void *key)
4840 { 4852 {
4841 struct mem_cgroup_event *event = 4853 struct mem_cgroup_event *event =
4842 container_of(wait, struct mem_cgroup_event, wait); 4854 container_of(wait, struct mem_cgroup_event, wait);
4843 struct mem_cgroup *memcg = event->memcg; 4855 struct mem_cgroup *memcg = event->memcg;
4844 unsigned long flags = (unsigned long)key; 4856 unsigned long flags = (unsigned long)key;
4845 4857
4846 if (flags & POLLHUP) { 4858 if (flags & POLLHUP) {
4847 /* 4859 /*
4848 * If the event has been detached at cgroup removal, we 4860 * If the event has been detached at cgroup removal, we
4849 * can simply return knowing the other side will clean up 4861 * can simply return knowing the other side will clean up
4850 * for us. 4862 * for us.
4851 * 4863 *
4852 * We can't race against event freeing since the other 4864 * We can't race against event freeing since the other
4853 * side will acquire wqh->lock via remove_wait_queue(), 4865 * side will acquire wqh->lock via remove_wait_queue(),
4854 * which we hold. 4866 * which we hold.
4855 */ 4867 */
4856 spin_lock(&memcg->event_list_lock); 4868 spin_lock(&memcg->event_list_lock);
4857 if (!list_empty(&event->list)) { 4869 if (!list_empty(&event->list)) {
4858 list_del_init(&event->list); 4870 list_del_init(&event->list);
4859 /* 4871 /*
4860 * We are in atomic context, but memcg_event_remove() 4872 * We are in atomic context, but memcg_event_remove()
4861 * may sleep, so we have to call it from a workqueue. 4873 * may sleep, so we have to call it from a workqueue.
4862 */ 4874 */
4863 schedule_work(&event->remove); 4875 schedule_work(&event->remove);
4864 } 4876 }
4865 spin_unlock(&memcg->event_list_lock); 4877 spin_unlock(&memcg->event_list_lock);
4866 } 4878 }
4867 4879
4868 return 0; 4880 return 0;
4869 } 4881 }
4870 4882
4871 static void memcg_event_ptable_queue_proc(struct file *file, 4883 static void memcg_event_ptable_queue_proc(struct file *file,
4872 wait_queue_head_t *wqh, poll_table *pt) 4884 wait_queue_head_t *wqh, poll_table *pt)
4873 { 4885 {
4874 struct mem_cgroup_event *event = 4886 struct mem_cgroup_event *event =
4875 container_of(pt, struct mem_cgroup_event, pt); 4887 container_of(pt, struct mem_cgroup_event, pt);
4876 4888
4877 event->wqh = wqh; 4889 event->wqh = wqh;
4878 add_wait_queue(wqh, &event->wait); 4890 add_wait_queue(wqh, &event->wait);
4879 } 4891 }
4880 4892
4881 /* 4893 /*
4882 * DO NOT USE IN NEW FILES. 4894 * DO NOT USE IN NEW FILES.
4883 * 4895 *
4884 * Parse input and register new cgroup event handler. 4896 * Parse input and register new cgroup event handler.
4885 * 4897 *
4886 * Input must be in format '<event_fd> <control_fd> <args>'. 4898 * Input must be in format '<event_fd> <control_fd> <args>'.
4887 * Interpretation of args is defined by control file implementation. 4899 * Interpretation of args is defined by control file implementation.
4888 */ 4900 */
4889 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4901 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4890 char *buf, size_t nbytes, loff_t off) 4902 char *buf, size_t nbytes, loff_t off)
4891 { 4903 {
4892 struct cgroup_subsys_state *css = of_css(of); 4904 struct cgroup_subsys_state *css = of_css(of);
4893 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4905 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4894 struct mem_cgroup_event *event; 4906 struct mem_cgroup_event *event;
4895 struct cgroup_subsys_state *cfile_css; 4907 struct cgroup_subsys_state *cfile_css;
4896 unsigned int efd, cfd; 4908 unsigned int efd, cfd;
4897 struct fd efile; 4909 struct fd efile;
4898 struct fd cfile; 4910 struct fd cfile;
4899 const char *name; 4911 const char *name;
4900 char *endp; 4912 char *endp;
4901 int ret; 4913 int ret;
4902 4914
4903 buf = strstrip(buf); 4915 buf = strstrip(buf);
4904 4916
4905 efd = simple_strtoul(buf, &endp, 10); 4917 efd = simple_strtoul(buf, &endp, 10);
4906 if (*endp != ' ') 4918 if (*endp != ' ')
4907 return -EINVAL; 4919 return -EINVAL;
4908 buf = endp + 1; 4920 buf = endp + 1;
4909 4921
4910 cfd = simple_strtoul(buf, &endp, 10); 4922 cfd = simple_strtoul(buf, &endp, 10);
4911 if ((*endp != ' ') && (*endp != '\0')) 4923 if ((*endp != ' ') && (*endp != '\0'))
4912 return -EINVAL; 4924 return -EINVAL;
4913 buf = endp + 1; 4925 buf = endp + 1;
4914 4926
4915 event = kzalloc(sizeof(*event), GFP_KERNEL); 4927 event = kzalloc(sizeof(*event), GFP_KERNEL);
4916 if (!event) 4928 if (!event)
4917 return -ENOMEM; 4929 return -ENOMEM;
4918 4930
4919 event->memcg = memcg; 4931 event->memcg = memcg;
4920 INIT_LIST_HEAD(&event->list); 4932 INIT_LIST_HEAD(&event->list);
4921 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4933 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4922 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4934 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4923 INIT_WORK(&event->remove, memcg_event_remove); 4935 INIT_WORK(&event->remove, memcg_event_remove);
4924 4936
4925 efile = fdget(efd); 4937 efile = fdget(efd);
4926 if (!efile.file) { 4938 if (!efile.file) {
4927 ret = -EBADF; 4939 ret = -EBADF;
4928 goto out_kfree; 4940 goto out_kfree;
4929 } 4941 }
4930 4942
4931 event->eventfd = eventfd_ctx_fileget(efile.file); 4943 event->eventfd = eventfd_ctx_fileget(efile.file);
4932 if (IS_ERR(event->eventfd)) { 4944 if (IS_ERR(event->eventfd)) {
4933 ret = PTR_ERR(event->eventfd); 4945 ret = PTR_ERR(event->eventfd);
4934 goto out_put_efile; 4946 goto out_put_efile;
4935 } 4947 }
4936 4948
4937 cfile = fdget(cfd); 4949 cfile = fdget(cfd);
4938 if (!cfile.file) { 4950 if (!cfile.file) {
4939 ret = -EBADF; 4951 ret = -EBADF;
4940 goto out_put_eventfd; 4952 goto out_put_eventfd;
4941 } 4953 }
4942 4954
4943 /* the process needs read permission on the control file */ 4955 /* the process needs read permission on the control file */
4944 /* AV: shouldn't we check that it's been opened for read instead? */ 4956 /* AV: shouldn't we check that it's been opened for read instead? */
4945 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4957 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4946 if (ret < 0) 4958 if (ret < 0)
4947 goto out_put_cfile; 4959 goto out_put_cfile;
4948 4960
4949 /* 4961 /*
4950 * Determine the event callbacks and set them in @event. This used 4962 * Determine the event callbacks and set them in @event. This used
4951 * to be done via struct cftype but cgroup core no longer knows 4963 * to be done via struct cftype but cgroup core no longer knows
4952 * about these events. The following is crude but the whole thing 4964 * about these events. The following is crude but the whole thing
4953 * is for compatibility anyway. 4965 * is for compatibility anyway.
4954 * 4966 *
4955 * DO NOT ADD NEW FILES. 4967 * DO NOT ADD NEW FILES.
4956 */ 4968 */
4957 name = cfile.file->f_dentry->d_name.name; 4969 name = cfile.file->f_dentry->d_name.name;
4958 4970
4959 if (!strcmp(name, "memory.usage_in_bytes")) { 4971 if (!strcmp(name, "memory.usage_in_bytes")) {
4960 event->register_event = mem_cgroup_usage_register_event; 4972 event->register_event = mem_cgroup_usage_register_event;
4961 event->unregister_event = mem_cgroup_usage_unregister_event; 4973 event->unregister_event = mem_cgroup_usage_unregister_event;
4962 } else if (!strcmp(name, "memory.oom_control")) { 4974 } else if (!strcmp(name, "memory.oom_control")) {
4963 event->register_event = mem_cgroup_oom_register_event; 4975 event->register_event = mem_cgroup_oom_register_event;
4964 event->unregister_event = mem_cgroup_oom_unregister_event; 4976 event->unregister_event = mem_cgroup_oom_unregister_event;
4965 } else if (!strcmp(name, "memory.pressure_level")) { 4977 } else if (!strcmp(name, "memory.pressure_level")) {
4966 event->register_event = vmpressure_register_event; 4978 event->register_event = vmpressure_register_event;
4967 event->unregister_event = vmpressure_unregister_event; 4979 event->unregister_event = vmpressure_unregister_event;
4968 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4980 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4969 event->register_event = memsw_cgroup_usage_register_event; 4981 event->register_event = memsw_cgroup_usage_register_event;
4970 event->unregister_event = memsw_cgroup_usage_unregister_event; 4982 event->unregister_event = memsw_cgroup_usage_unregister_event;
4971 } else { 4983 } else {
4972 ret = -EINVAL; 4984 ret = -EINVAL;
4973 goto out_put_cfile; 4985 goto out_put_cfile;
4974 } 4986 }
4975 4987
4976 /* 4988 /*
4977 * Verify that @cfile belongs to @css. Also, remaining events are 4989 * Verify that @cfile belongs to @css. Also, remaining events are
4978 * automatically removed on cgroup destruction but the removal is 4990 * automatically removed on cgroup destruction but the removal is
4979 * asynchronous, so take an extra ref on @css. 4991 * asynchronous, so take an extra ref on @css.
4980 */ 4992 */
4981 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, 4993 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
4982 &memory_cgrp_subsys); 4994 &memory_cgrp_subsys);
4983 ret = -EINVAL; 4995 ret = -EINVAL;
4984 if (IS_ERR(cfile_css)) 4996 if (IS_ERR(cfile_css))
4985 goto out_put_cfile; 4997 goto out_put_cfile;
4986 if (cfile_css != css) { 4998 if (cfile_css != css) {
4987 css_put(cfile_css); 4999 css_put(cfile_css);
4988 goto out_put_cfile; 5000 goto out_put_cfile;
4989 } 5001 }
4990 5002
4991 ret = event->register_event(memcg, event->eventfd, buf); 5003 ret = event->register_event(memcg, event->eventfd, buf);
4992 if (ret) 5004 if (ret)
4993 goto out_put_css; 5005 goto out_put_css;
4994 5006
4995 efile.file->f_op->poll(efile.file, &event->pt); 5007 efile.file->f_op->poll(efile.file, &event->pt);
4996 5008
4997 spin_lock(&memcg->event_list_lock); 5009 spin_lock(&memcg->event_list_lock);
4998 list_add(&event->list, &memcg->event_list); 5010 list_add(&event->list, &memcg->event_list);
4999 spin_unlock(&memcg->event_list_lock); 5011 spin_unlock(&memcg->event_list_lock);
5000 5012
5001 fdput(cfile); 5013 fdput(cfile);
5002 fdput(efile); 5014 fdput(efile);
5003 5015
5004 return nbytes; 5016 return nbytes;
5005 5017
5006 out_put_css: 5018 out_put_css:
5007 css_put(css); 5019 css_put(css);
5008 out_put_cfile: 5020 out_put_cfile:
5009 fdput(cfile); 5021 fdput(cfile);
5010 out_put_eventfd: 5022 out_put_eventfd:
5011 eventfd_ctx_put(event->eventfd); 5023 eventfd_ctx_put(event->eventfd);
5012 out_put_efile: 5024 out_put_efile:
5013 fdput(efile); 5025 fdput(efile);
5014 out_kfree: 5026 out_kfree:
5015 kfree(event); 5027 kfree(event);
5016 5028
5017 return ret; 5029 return ret;
5018 } 5030 }
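To illustrate the interface parsed above, a hypothetical userspace example that arms a memory-usage threshold through the legacy cgroup.event_control file: it writes "<event_fd> <control_fd> <args>" exactly as described in the comment before memcg_write_event_control(), then blocks on the eventfd until the threshold is crossed. The cgroup path and the 50M threshold are assumptions:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ctl = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t ticks;

	if (efd < 0 || cfd < 0 || ctl < 0) {
		perror("open");
		return 1;
	}
	/* "<event_fd> <control_fd> <args>"; args here is a 50M threshold. */
	snprintf(buf, sizeof(buf), "%d %d 52428800", efd, cfd);
	if (write(ctl, buf, strlen(buf)) < 0) {
		perror("write");
		return 1;
	}
	read(efd, &ticks, sizeof(ticks));	/* blocks until the threshold is crossed */
	printf("threshold crossed %llu time(s)\n", (unsigned long long)ticks);
	return 0;
}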
5019 5031
5020 static struct cftype mem_cgroup_files[] = { 5032 static struct cftype mem_cgroup_files[] = {
5021 { 5033 {
5022 .name = "usage_in_bytes", 5034 .name = "usage_in_bytes",
5023 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5035 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5024 .read_u64 = mem_cgroup_read_u64, 5036 .read_u64 = mem_cgroup_read_u64,
5025 }, 5037 },
5026 { 5038 {
5027 .name = "max_usage_in_bytes", 5039 .name = "max_usage_in_bytes",
5028 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5040 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5029 .write = mem_cgroup_reset, 5041 .write = mem_cgroup_reset,
5030 .read_u64 = mem_cgroup_read_u64, 5042 .read_u64 = mem_cgroup_read_u64,
5031 }, 5043 },
5032 { 5044 {
5033 .name = "limit_in_bytes", 5045 .name = "limit_in_bytes",
5034 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5046 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5035 .write = mem_cgroup_write, 5047 .write = mem_cgroup_write,
5036 .read_u64 = mem_cgroup_read_u64, 5048 .read_u64 = mem_cgroup_read_u64,
5037 }, 5049 },
5038 { 5050 {
5039 .name = "soft_limit_in_bytes", 5051 .name = "soft_limit_in_bytes",
5040 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5052 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5041 .write = mem_cgroup_write, 5053 .write = mem_cgroup_write,
5042 .read_u64 = mem_cgroup_read_u64, 5054 .read_u64 = mem_cgroup_read_u64,
5043 }, 5055 },
5044 { 5056 {
5045 .name = "failcnt", 5057 .name = "failcnt",
5046 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5058 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5047 .write = mem_cgroup_reset, 5059 .write = mem_cgroup_reset,
5048 .read_u64 = mem_cgroup_read_u64, 5060 .read_u64 = mem_cgroup_read_u64,
5049 }, 5061 },
5050 { 5062 {
5051 .name = "stat", 5063 .name = "stat",
5052 .seq_show = memcg_stat_show, 5064 .seq_show = memcg_stat_show,
5053 }, 5065 },
5054 { 5066 {
5055 .name = "force_empty", 5067 .name = "force_empty",
5056 .write = mem_cgroup_force_empty_write, 5068 .write = mem_cgroup_force_empty_write,
5057 }, 5069 },
5058 { 5070 {
5059 .name = "use_hierarchy", 5071 .name = "use_hierarchy",
5060 .write_u64 = mem_cgroup_hierarchy_write, 5072 .write_u64 = mem_cgroup_hierarchy_write,
5061 .read_u64 = mem_cgroup_hierarchy_read, 5073 .read_u64 = mem_cgroup_hierarchy_read,
5062 }, 5074 },
5063 { 5075 {
5064 .name = "cgroup.event_control", /* XXX: for compat */ 5076 .name = "cgroup.event_control", /* XXX: for compat */
5065 .write = memcg_write_event_control, 5077 .write = memcg_write_event_control,
5066 .flags = CFTYPE_NO_PREFIX, 5078 .flags = CFTYPE_NO_PREFIX,
5067 .mode = S_IWUGO, 5079 .mode = S_IWUGO,
5068 }, 5080 },
5069 { 5081 {
5070 .name = "swappiness", 5082 .name = "swappiness",
5071 .read_u64 = mem_cgroup_swappiness_read, 5083 .read_u64 = mem_cgroup_swappiness_read,
5072 .write_u64 = mem_cgroup_swappiness_write, 5084 .write_u64 = mem_cgroup_swappiness_write,
5073 }, 5085 },
5074 { 5086 {
5075 .name = "move_charge_at_immigrate", 5087 .name = "move_charge_at_immigrate",
5076 .read_u64 = mem_cgroup_move_charge_read, 5088 .read_u64 = mem_cgroup_move_charge_read,
5077 .write_u64 = mem_cgroup_move_charge_write, 5089 .write_u64 = mem_cgroup_move_charge_write,
5078 }, 5090 },
5079 { 5091 {
5080 .name = "oom_control", 5092 .name = "oom_control",
5081 .seq_show = mem_cgroup_oom_control_read, 5093 .seq_show = mem_cgroup_oom_control_read,
5082 .write_u64 = mem_cgroup_oom_control_write, 5094 .write_u64 = mem_cgroup_oom_control_write,
5083 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5095 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5084 }, 5096 },
5085 { 5097 {
5086 .name = "pressure_level", 5098 .name = "pressure_level",
5087 }, 5099 },
5088 #ifdef CONFIG_NUMA 5100 #ifdef CONFIG_NUMA
5089 { 5101 {
5090 .name = "numa_stat", 5102 .name = "numa_stat",
5091 .seq_show = memcg_numa_stat_show, 5103 .seq_show = memcg_numa_stat_show,
5092 }, 5104 },
5093 #endif 5105 #endif
5094 #ifdef CONFIG_MEMCG_KMEM 5106 #ifdef CONFIG_MEMCG_KMEM
5095 { 5107 {
5096 .name = "kmem.limit_in_bytes", 5108 .name = "kmem.limit_in_bytes",
5097 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5109 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5098 .write = mem_cgroup_write, 5110 .write = mem_cgroup_write,
5099 .read_u64 = mem_cgroup_read_u64, 5111 .read_u64 = mem_cgroup_read_u64,
5100 }, 5112 },
5101 { 5113 {
5102 .name = "kmem.usage_in_bytes", 5114 .name = "kmem.usage_in_bytes",
5103 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5115 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5104 .read_u64 = mem_cgroup_read_u64, 5116 .read_u64 = mem_cgroup_read_u64,
5105 }, 5117 },
5106 { 5118 {
5107 .name = "kmem.failcnt", 5119 .name = "kmem.failcnt",
5108 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5120 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5109 .write = mem_cgroup_reset, 5121 .write = mem_cgroup_reset,
5110 .read_u64 = mem_cgroup_read_u64, 5122 .read_u64 = mem_cgroup_read_u64,
5111 }, 5123 },
5112 { 5124 {
5113 .name = "kmem.max_usage_in_bytes", 5125 .name = "kmem.max_usage_in_bytes",
5114 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5126 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5115 .write = mem_cgroup_reset, 5127 .write = mem_cgroup_reset,
5116 .read_u64 = mem_cgroup_read_u64, 5128 .read_u64 = mem_cgroup_read_u64,
5117 }, 5129 },
5118 #ifdef CONFIG_SLABINFO 5130 #ifdef CONFIG_SLABINFO
5119 { 5131 {
5120 .name = "kmem.slabinfo", 5132 .name = "kmem.slabinfo",
5121 .seq_show = mem_cgroup_slabinfo_read, 5133 .seq_show = mem_cgroup_slabinfo_read,
5122 }, 5134 },
5123 #endif 5135 #endif
5124 #endif 5136 #endif
5125 { }, /* terminate */ 5137 { }, /* terminate */
5126 }; 5138 };
5127 5139
5128 #ifdef CONFIG_MEMCG_SWAP 5140 #ifdef CONFIG_MEMCG_SWAP
5129 static struct cftype memsw_cgroup_files[] = { 5141 static struct cftype memsw_cgroup_files[] = {
5130 { 5142 {
5131 .name = "memsw.usage_in_bytes", 5143 .name = "memsw.usage_in_bytes",
5132 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 5144 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5133 .read_u64 = mem_cgroup_read_u64, 5145 .read_u64 = mem_cgroup_read_u64,
5134 }, 5146 },
5135 { 5147 {
5136 .name = "memsw.max_usage_in_bytes", 5148 .name = "memsw.max_usage_in_bytes",
5137 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 5149 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5138 .write = mem_cgroup_reset, 5150 .write = mem_cgroup_reset,
5139 .read_u64 = mem_cgroup_read_u64, 5151 .read_u64 = mem_cgroup_read_u64,
5140 }, 5152 },
5141 { 5153 {
5142 .name = "memsw.limit_in_bytes", 5154 .name = "memsw.limit_in_bytes",
5143 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 5155 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5144 .write = mem_cgroup_write, 5156 .write = mem_cgroup_write,
5145 .read_u64 = mem_cgroup_read_u64, 5157 .read_u64 = mem_cgroup_read_u64,
5146 }, 5158 },
5147 { 5159 {
5148 .name = "memsw.failcnt", 5160 .name = "memsw.failcnt",
5149 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 5161 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5150 .write = mem_cgroup_reset, 5162 .write = mem_cgroup_reset,
5151 .read_u64 = mem_cgroup_read_u64, 5163 .read_u64 = mem_cgroup_read_u64,
5152 }, 5164 },
5153 { }, /* terminate */ 5165 { }, /* terminate */
5154 }; 5166 };
5155 #endif 5167 #endif
5156 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5168 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5157 { 5169 {
5158 struct mem_cgroup_per_node *pn; 5170 struct mem_cgroup_per_node *pn;
5159 struct mem_cgroup_per_zone *mz; 5171 struct mem_cgroup_per_zone *mz;
5160 int zone, tmp = node; 5172 int zone, tmp = node;
5161 /* 5173 /*
5162 * This routine is called for each possible node. 5174 * This routine is called for each possible node.
5163 * But it's a BUG to call kmalloc() against an offline node. 5175 * But it's a BUG to call kmalloc() against an offline node.
5164 * 5176 *
5165 * TODO: this routine can waste a lot of memory for nodes which will 5177 * TODO: this routine can waste a lot of memory for nodes which will
5166 * never be onlined. It's better to use a memory hotplug callback 5178 * never be onlined. It's better to use a memory hotplug callback
5167 * function. 5179 * function.
5168 */ 5180 */
5169 if (!node_state(node, N_NORMAL_MEMORY)) 5181 if (!node_state(node, N_NORMAL_MEMORY))
5170 tmp = -1; 5182 tmp = -1;
5171 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5183 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5172 if (!pn) 5184 if (!pn)
5173 return 1; 5185 return 1;
5174 5186
5175 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5187 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5176 mz = &pn->zoneinfo[zone]; 5188 mz = &pn->zoneinfo[zone];
5177 lruvec_init(&mz->lruvec); 5189 lruvec_init(&mz->lruvec);
5178 mz->usage_in_excess = 0; 5190 mz->usage_in_excess = 0;
5179 mz->on_tree = false; 5191 mz->on_tree = false;
5180 mz->memcg = memcg; 5192 mz->memcg = memcg;
5181 } 5193 }
5182 memcg->nodeinfo[node] = pn; 5194 memcg->nodeinfo[node] = pn;
5183 return 0; 5195 return 0;
5184 } 5196 }
5185 5197
5186 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5198 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5187 { 5199 {
5188 kfree(memcg->nodeinfo[node]); 5200 kfree(memcg->nodeinfo[node]);
5189 } 5201 }
5190 5202
5191 static struct mem_cgroup *mem_cgroup_alloc(void) 5203 static struct mem_cgroup *mem_cgroup_alloc(void)
5192 { 5204 {
5193 struct mem_cgroup *memcg; 5205 struct mem_cgroup *memcg;
5194 size_t size; 5206 size_t size;
5195 5207
5196 size = sizeof(struct mem_cgroup); 5208 size = sizeof(struct mem_cgroup);
5197 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5209 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5198 5210
5199 memcg = kzalloc(size, GFP_KERNEL); 5211 memcg = kzalloc(size, GFP_KERNEL);
5200 if (!memcg) 5212 if (!memcg)
5201 return NULL; 5213 return NULL;
5202 5214
5203 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 5215 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
5204 if (!memcg->stat) 5216 if (!memcg->stat)
5205 goto out_free; 5217 goto out_free;
5206 spin_lock_init(&memcg->pcp_counter_lock); 5218 spin_lock_init(&memcg->pcp_counter_lock);
5207 return memcg; 5219 return memcg;
5208 5220
5209 out_free: 5221 out_free:
5210 kfree(memcg); 5222 kfree(memcg);
5211 return NULL; 5223 return NULL;
5212 } 5224 }
5213 5225
5214 /* 5226 /*
5215 * When destroying a mem_cgroup, references from swap_cgroup can remain. 5227 * When destroying a mem_cgroup, references from swap_cgroup can remain.
5216 * (scanning all at force_empty is too costly...) 5228 * (scanning all at force_empty is too costly...)
5217 * 5229 *
5218 * Instead of clearing all references at force_empty, we remember 5230 * Instead of clearing all references at force_empty, we remember
5219 * the number of references from swap_cgroup and free the mem_cgroup when 5231 * the number of references from swap_cgroup and free the mem_cgroup when
5220 * it goes down to 0. 5232 * it goes down to 0.
5221 * 5233 *
5222 * Removal of cgroup itself succeeds regardless of refs from swap. 5234 * Removal of cgroup itself succeeds regardless of refs from swap.
5223 */ 5235 */
5224 5236
5225 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5237 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5226 { 5238 {
5227 int node; 5239 int node;
5228 5240
5229 mem_cgroup_remove_from_trees(memcg); 5241 mem_cgroup_remove_from_trees(memcg);
5230 5242
5231 for_each_node(node) 5243 for_each_node(node)
5232 free_mem_cgroup_per_zone_info(memcg, node); 5244 free_mem_cgroup_per_zone_info(memcg, node);
5233 5245
5234 free_percpu(memcg->stat); 5246 free_percpu(memcg->stat);
5235 5247
5236 /* 5248 /*
5237 * We need to make sure that (at least for now), the jump label 5249 * We need to make sure that (at least for now), the jump label
5238 * destruction code runs outside of the cgroup lock. This is because 5250 * destruction code runs outside of the cgroup lock. This is because
5239 * get_online_cpus(), which is called from the static_branch update, 5251 * get_online_cpus(), which is called from the static_branch update,
5240 * can't be called inside the cgroup_lock. cpusets are the ones 5252 * can't be called inside the cgroup_lock. cpusets are the ones
5241 * enforcing this dependency, so if they ever change, we might as well. 5253 * enforcing this dependency, so if they ever change, we might as well.
5242 * 5254 *
5243 * schedule_work() will guarantee this happens. Be careful if you need 5255 * schedule_work() will guarantee this happens. Be careful if you need
5244 * to move this code around, and make sure it is outside 5256 * to move this code around, and make sure it is outside
5245 * the cgroup_lock. 5257 * the cgroup_lock.
5246 */ 5258 */
5247 disarm_static_keys(memcg); 5259 disarm_static_keys(memcg);
5248 kfree(memcg); 5260 kfree(memcg);
5249 } 5261 }
5250 5262
5251 /* 5263 /*
5252 * Returns the parent mem_cgroup in the mem_cgroup hierarchy with hierarchy enabled. 5264 * Returns the parent mem_cgroup in the mem_cgroup hierarchy with hierarchy enabled.
5253 */ 5265 */
5254 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 5266 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5255 { 5267 {
5256 if (!memcg->memory.parent) 5268 if (!memcg->memory.parent)
5257 return NULL; 5269 return NULL;
5258 return mem_cgroup_from_counter(memcg->memory.parent, memory); 5270 return mem_cgroup_from_counter(memcg->memory.parent, memory);
5259 } 5271 }
5260 EXPORT_SYMBOL(parent_mem_cgroup); 5272 EXPORT_SYMBOL(parent_mem_cgroup);
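parent_mem_cgroup() returns NULL once the root is reached, so walking up the hierarchy reduces to a simple loop. A minimal sketch of such an ancestor walk (visit_memcg() is a hypothetical placeholder for per-level work, not a kernel function):

	static void walk_memcg_ancestors(struct mem_cgroup *memcg)
	{
		for (; memcg; memcg = parent_mem_cgroup(memcg))
			visit_memcg(memcg);	/* hypothetical per-level work */
	}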
5261 5273
5262 static void __init mem_cgroup_soft_limit_tree_init(void) 5274 static void __init mem_cgroup_soft_limit_tree_init(void)
5263 { 5275 {
5264 struct mem_cgroup_tree_per_node *rtpn; 5276 struct mem_cgroup_tree_per_node *rtpn;
5265 struct mem_cgroup_tree_per_zone *rtpz; 5277 struct mem_cgroup_tree_per_zone *rtpz;
5266 int tmp, node, zone; 5278 int tmp, node, zone;
5267 5279
5268 for_each_node(node) { 5280 for_each_node(node) {
5269 tmp = node; 5281 tmp = node;
5270 if (!node_state(node, N_NORMAL_MEMORY)) 5282 if (!node_state(node, N_NORMAL_MEMORY))
5271 tmp = -1; 5283 tmp = -1;
5272 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5284 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5273 BUG_ON(!rtpn); 5285 BUG_ON(!rtpn);
5274 5286
5275 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5287 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5276 5288
5277 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5289 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5278 rtpz = &rtpn->rb_tree_per_zone[zone]; 5290 rtpz = &rtpn->rb_tree_per_zone[zone];
5279 rtpz->rb_root = RB_ROOT; 5291 rtpz->rb_root = RB_ROOT;
5280 spin_lock_init(&rtpz->lock); 5292 spin_lock_init(&rtpz->lock);
5281 } 5293 }
5282 } 5294 }
5283 } 5295 }
5284 5296
5285 static struct cgroup_subsys_state * __ref 5297 static struct cgroup_subsys_state * __ref
5286 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5298 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5287 { 5299 {
5288 struct mem_cgroup *memcg; 5300 struct mem_cgroup *memcg;
5289 long error = -ENOMEM; 5301 long error = -ENOMEM;
5290 int node; 5302 int node;
5291 5303
5292 memcg = mem_cgroup_alloc(); 5304 memcg = mem_cgroup_alloc();
5293 if (!memcg) 5305 if (!memcg)
5294 return ERR_PTR(error); 5306 return ERR_PTR(error);
5295 5307
5296 for_each_node(node) 5308 for_each_node(node)
5297 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 5309 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5298 goto free_out; 5310 goto free_out;
5299 5311
5300 /* root ? */ 5312 /* root ? */
5301 if (parent_css == NULL) { 5313 if (parent_css == NULL) {
5302 root_mem_cgroup = memcg; 5314 root_mem_cgroup = memcg;
5303 page_counter_init(&memcg->memory, NULL); 5315 page_counter_init(&memcg->memory, NULL);
5304 page_counter_init(&memcg->memsw, NULL); 5316 page_counter_init(&memcg->memsw, NULL);
5305 page_counter_init(&memcg->kmem, NULL); 5317 page_counter_init(&memcg->kmem, NULL);
5306 } 5318 }
5307 5319
5308 memcg->last_scanned_node = MAX_NUMNODES; 5320 memcg->last_scanned_node = MAX_NUMNODES;
5309 INIT_LIST_HEAD(&memcg->oom_notify); 5321 INIT_LIST_HEAD(&memcg->oom_notify);
5310 memcg->move_charge_at_immigrate = 0; 5322 memcg->move_charge_at_immigrate = 0;
5311 mutex_init(&memcg->thresholds_lock); 5323 mutex_init(&memcg->thresholds_lock);
5312 spin_lock_init(&memcg->move_lock); 5324 spin_lock_init(&memcg->move_lock);
5313 vmpressure_init(&memcg->vmpressure); 5325 vmpressure_init(&memcg->vmpressure);
5314 INIT_LIST_HEAD(&memcg->event_list); 5326 INIT_LIST_HEAD(&memcg->event_list);
5315 spin_lock_init(&memcg->event_list_lock); 5327 spin_lock_init(&memcg->event_list_lock);
5316 5328
5317 return &memcg->css; 5329 return &memcg->css;
5318 5330
5319 free_out: 5331 free_out:
5320 __mem_cgroup_free(memcg); 5332 __mem_cgroup_free(memcg);
5321 return ERR_PTR(error); 5333 return ERR_PTR(error);
5322 } 5334 }
5323 5335
5324 static int 5336 static int
5325 mem_cgroup_css_online(struct cgroup_subsys_state *css) 5337 mem_cgroup_css_online(struct cgroup_subsys_state *css)
5326 { 5338 {
5327 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5339 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5328 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5340 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
5329 int ret; 5341 int ret;
5330 5342
5331 if (css->id > MEM_CGROUP_ID_MAX) 5343 if (css->id > MEM_CGROUP_ID_MAX)
5332 return -ENOSPC; 5344 return -ENOSPC;
5333 5345
5334 if (!parent) 5346 if (!parent)
5335 return 0; 5347 return 0;
5336 5348
5337 mutex_lock(&memcg_create_mutex); 5349 mutex_lock(&memcg_create_mutex);
5338 5350
5339 memcg->use_hierarchy = parent->use_hierarchy; 5351 memcg->use_hierarchy = parent->use_hierarchy;
5340 memcg->oom_kill_disable = parent->oom_kill_disable; 5352 memcg->oom_kill_disable = parent->oom_kill_disable;
5341 memcg->swappiness = mem_cgroup_swappiness(parent); 5353 memcg->swappiness = mem_cgroup_swappiness(parent);
5342 5354
5343 if (parent->use_hierarchy) { 5355 if (parent->use_hierarchy) {
5344 page_counter_init(&memcg->memory, &parent->memory); 5356 page_counter_init(&memcg->memory, &parent->memory);
5345 page_counter_init(&memcg->memsw, &parent->memsw); 5357 page_counter_init(&memcg->memsw, &parent->memsw);
5346 page_counter_init(&memcg->kmem, &parent->kmem); 5358 page_counter_init(&memcg->kmem, &parent->kmem);
5347 5359
5348 /* 5360 /*
5349 * No need to take a reference to the parent because cgroup 5361 * No need to take a reference to the parent because cgroup
5350 * core guarantees its existence. 5362 * core guarantees its existence.
5351 */ 5363 */
5352 } else { 5364 } else {
5353 page_counter_init(&memcg->memory, NULL); 5365 page_counter_init(&memcg->memory, NULL);
5354 page_counter_init(&memcg->memsw, NULL); 5366 page_counter_init(&memcg->memsw, NULL);
5355 page_counter_init(&memcg->kmem, NULL); 5367 page_counter_init(&memcg->kmem, NULL);
5356 /* 5368 /*
5357 * Deeper hierarchy with use_hierarchy == false doesn't make 5369 * Deeper hierarchy with use_hierarchy == false doesn't make
5358 * much sense so let cgroup subsystem know about this 5370 * much sense so let cgroup subsystem know about this
5359 * unfortunate state in our controller. 5371 * unfortunate state in our controller.
5360 */ 5372 */
5361 if (parent != root_mem_cgroup) 5373 if (parent != root_mem_cgroup)
5362 memory_cgrp_subsys.broken_hierarchy = true; 5374 memory_cgrp_subsys.broken_hierarchy = true;
5363 } 5375 }
5364 mutex_unlock(&memcg_create_mutex); 5376 mutex_unlock(&memcg_create_mutex);
5365 5377
5366 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 5378 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
5367 if (ret) 5379 if (ret)
5368 return ret; 5380 return ret;
5369 5381
5370 /* 5382 /*
5371 * Make sure the memcg is initialized: mem_cgroup_iter() 5383 * Make sure the memcg is initialized: mem_cgroup_iter()
5372 * orders reading memcg->initialized against its callers 5384 * orders reading memcg->initialized against its callers
5373 * reading the memcg members. 5385 * reading the memcg members.
5374 */ 5386 */
5375 smp_store_release(&memcg->initialized, 1); 5387 smp_store_release(&memcg->initialized, 1);
5376 5388
5377 return 0; 5389 return 0;
5378 } 5390 }
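The smp_store_release() above publishes memcg->initialized only after the initialization stores that precede it; the consumer has to pair it with an acquire load. A hedged sketch of the reader side (an illustration of the pairing only; the real consumer, mem_cgroup_iter(), is outside this hunk):

	/* reader side: only treat memcgs as usable once fully onlined */
	static bool memcg_online_visible(struct mem_cgroup *memcg)
	{
		/* acquire pairs with smp_store_release() in css_online() */
		return smp_load_acquire(&memcg->initialized);
	}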
5379 5391
5380 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5392 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5381 { 5393 {
5382 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5394 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5383 struct mem_cgroup_event *event, *tmp; 5395 struct mem_cgroup_event *event, *tmp;
5384 struct cgroup_subsys_state *iter; 5396 struct cgroup_subsys_state *iter;
5385 5397
5386 /* 5398 /*
5387 * Unregister events and notify userspace. 5399 * Unregister events and notify userspace.
5388 * Notify userspace about cgroup removal only after rmdir of the cgroup 5400 * Notify userspace about cgroup removal only after rmdir of the cgroup
5389 * directory to avoid a race between userspace and kernelspace. 5401 * directory to avoid a race between userspace and kernelspace.
5390 */ 5402 */
5391 spin_lock(&memcg->event_list_lock); 5403 spin_lock(&memcg->event_list_lock);
5392 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5404 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5393 list_del_init(&event->list); 5405 list_del_init(&event->list);
5394 schedule_work(&event->remove); 5406 schedule_work(&event->remove);
5395 } 5407 }
5396 spin_unlock(&memcg->event_list_lock); 5408 spin_unlock(&memcg->event_list_lock);
5397 5409
5398 kmem_cgroup_css_offline(memcg); 5410 kmem_cgroup_css_offline(memcg);
5399 5411
5400 /* 5412 /*
5401 * This requires that offlining is serialized. Right now that is 5413 * This requires that offlining is serialized. Right now that is
5402 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 5414 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5403 */ 5415 */
5404 css_for_each_descendant_post(iter, css) 5416 css_for_each_descendant_post(iter, css)
5405 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 5417 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5406 5418
5407 memcg_unregister_all_caches(memcg); 5419 memcg_unregister_all_caches(memcg);
5408 vmpressure_cleanup(&memcg->vmpressure); 5420 vmpressure_cleanup(&memcg->vmpressure);
5409 } 5421 }
5410 5422
5411 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5423 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5412 { 5424 {
5413 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5425 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5414 /* 5426 /*
5415 * XXX: css_offline() would be where we should reparent all 5427 * XXX: css_offline() would be where we should reparent all
5416 * memory to prepare the cgroup for destruction. However, 5428 * memory to prepare the cgroup for destruction. However,
5417 * memcg does not do css_tryget_online() and page_counter charging 5429 * memcg does not do css_tryget_online() and page_counter charging
5418 * under the same RCU lock region, which means that charging 5430 * under the same RCU lock region, which means that charging
5419 * could race with offlining. Offlining only happens to 5431 * could race with offlining. Offlining only happens to
5420 * cgroups with no tasks in them but charges can show up 5432 * cgroups with no tasks in them but charges can show up
5421 * without any tasks from the swapin path when the target 5433 * without any tasks from the swapin path when the target
5422 * memcg is looked up from the swapout record and not from the 5434 * memcg is looked up from the swapout record and not from the
5423 * current task as it usually is. A race like this can leak 5435 * current task as it usually is. A race like this can leak
5424 * charges and put pages with stale cgroup pointers into 5436 * charges and put pages with stale cgroup pointers into
5425 * circulation: 5437 * circulation:
5426 * 5438 *
5427 * #0 #1 5439 * #0 #1
5428 * lookup_swap_cgroup_id() 5440 * lookup_swap_cgroup_id()
5429 * rcu_read_lock() 5441 * rcu_read_lock()
5430 * mem_cgroup_lookup() 5442 * mem_cgroup_lookup()
5431 * css_tryget_online() 5443 * css_tryget_online()
5432 * rcu_read_unlock() 5444 * rcu_read_unlock()
5433 * disable css_tryget_online() 5445 * disable css_tryget_online()
5434 * call_rcu() 5446 * call_rcu()
5435 * offline_css() 5447 * offline_css()
5436 * reparent_charges() 5448 * reparent_charges()
5437 * page_counter_try_charge() 5449 * page_counter_try_charge()
5438 * css_put() 5450 * css_put()
5439 * css_free() 5451 * css_free()
5440 * pc->mem_cgroup = dead memcg 5452 * pc->mem_cgroup = dead memcg
5441 * add page to lru 5453 * add page to lru
5442 * 5454 *
5443 * The bulk of the charges are still moved in offline_css() to 5455 * The bulk of the charges are still moved in offline_css() to
5444 * avoid pinning a lot of pages in case a long-term reference 5456 * avoid pinning a lot of pages in case a long-term reference
5445 * like a swapout record is deferring the css_free() to long 5457 * like a swapout record is deferring the css_free() to long
5446 * after offlining. But this makes sure we catch any charges 5458 * after offlining. But this makes sure we catch any charges
5447 * made after offlining: 5459 * made after offlining:
5448 */ 5460 */
5449 mem_cgroup_reparent_charges(memcg); 5461 mem_cgroup_reparent_charges(memcg);
5450 5462
5451 memcg_destroy_kmem(memcg); 5463 memcg_destroy_kmem(memcg);
5452 __mem_cgroup_free(memcg); 5464 __mem_cgroup_free(memcg);
5453 } 5465 }
5454 5466
5455 /** 5467 /**
5456 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5468 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5457 * @css: the target css 5469 * @css: the target css
5458 * 5470 *
5459 * Reset the states of the mem_cgroup associated with @css. This is 5471 * Reset the states of the mem_cgroup associated with @css. This is
5460 * invoked when the userland requests disabling on the default hierarchy 5472 * invoked when the userland requests disabling on the default hierarchy
5461 * but the memcg is pinned through dependency. The memcg should stop 5473 * but the memcg is pinned through dependency. The memcg should stop
5462 * applying policies and should revert to the vanilla state as it may be 5474 * applying policies and should revert to the vanilla state as it may be
5463 * made visible again. 5475 * made visible again.
5464 * 5476 *
5465 * The current implementation only resets the essential configurations. 5477 * The current implementation only resets the essential configurations.
5466 * This needs to be expanded to cover all the visible parts. 5478 * This needs to be expanded to cover all the visible parts.
5467 */ 5479 */
5468 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5480 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5469 { 5481 {
5470 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5482 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5471 5483
5472 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 5484 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
5473 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 5485 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
5474 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 5486 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
5475 memcg->soft_limit = 0; 5487 memcg->soft_limit = 0;
5476 } 5488 }
5477 5489
5478 #ifdef CONFIG_MMU 5490 #ifdef CONFIG_MMU
5479 /* Handlers for move charge at task migration. */ 5491 /* Handlers for move charge at task migration. */
5480 static int mem_cgroup_do_precharge(unsigned long count) 5492 static int mem_cgroup_do_precharge(unsigned long count)
5481 { 5493 {
5482 int ret; 5494 int ret;
5483 5495
5484 /* Try a single bulk charge without reclaim first */ 5496 /* Try a single bulk charge without reclaim first */
5485 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 5497 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
5486 if (!ret) { 5498 if (!ret) {
5487 mc.precharge += count; 5499 mc.precharge += count;
5488 return ret; 5500 return ret;
5489 } 5501 }
5490 if (ret == -EINTR) { 5502 if (ret == -EINTR) {
5491 cancel_charge(root_mem_cgroup, count); 5503 cancel_charge(root_mem_cgroup, count);
5492 return ret; 5504 return ret;
5493 } 5505 }
5494 5506
5495 /* Try charges one by one with reclaim */ 5507 /* Try charges one by one with reclaim */
5496 while (count--) { 5508 while (count--) {
5497 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 5509 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
5498 /* 5510 /*
5499 * In case of failure, any residual charges against 5511 * In case of failure, any residual charges against
5500 * mc.to will be dropped by mem_cgroup_clear_mc() 5512 * mc.to will be dropped by mem_cgroup_clear_mc()
5501 * later on. However, cancel any charges that are 5513 * later on. However, cancel any charges that are
5502 * bypassed to root right away or they'll be lost. 5514 * bypassed to root right away or they'll be lost.
5503 */ 5515 */
5504 if (ret == -EINTR) 5516 if (ret == -EINTR)
5505 cancel_charge(root_mem_cgroup, 1); 5517 cancel_charge(root_mem_cgroup, 1);
5506 if (ret) 5518 if (ret)
5507 return ret; 5519 return ret;
5508 mc.precharge++; 5520 mc.precharge++;
5509 cond_resched(); 5521 cond_resched();
5510 } 5522 }
5511 return 0; 5523 return 0;
5512 } 5524 }
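mem_cgroup_do_precharge() follows a two-phase pattern: one optimistic bulk attempt that is not allowed to reclaim, then a per-page fallback that is. The same shape, stripped of the memcg specifics (try_bulk() and try_one() are hypothetical stand-ins for the try_charge() calls above):

	static int precharge_pattern(unsigned long count)
	{
		if (!try_bulk(count))		/* hypothetical: no reclaim, may fail under pressure */
			return 0;
		while (count--) {		/* fallback: one unit at a time, with reclaim */
			int ret = try_one();	/* hypothetical single-unit charge */
			if (ret)
				return ret;
		}
		return 0;
	}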
5513 5525
5514 /** 5526 /**
5515 * get_mctgt_type - get target type of moving charge 5527 * get_mctgt_type - get target type of moving charge
5516 * @vma: the vma the pte to be checked belongs 5528 * @vma: the vma the pte to be checked belongs
5517 * @addr: the address corresponding to the pte to be checked 5529 * @addr: the address corresponding to the pte to be checked
5518 * @ptent: the pte to be checked 5530 * @ptent: the pte to be checked
5519 * @target: pointer where the target page or swap entry will be stored (can be NULL) 5531 * @target: pointer where the target page or swap entry will be stored (can be NULL)
5520 * 5532 *
5521 * Returns 5533 * Returns
5522 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5534 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5523 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5535 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5524 * move charge. if @target is not NULL, the page is stored in target->page 5536 * move charge. if @target is not NULL, the page is stored in target->page
5525 * with extra refcnt got(Callers should handle it). 5537 * with extra refcnt got(Callers should handle it).
5526 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5538 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5527 * target for charge migration. if @target is not NULL, the entry is stored 5539 * target for charge migration. if @target is not NULL, the entry is stored
5528 * in target->ent. 5540 * in target->ent.
5529 * 5541 *
5530 * Called with pte lock held. 5542 * Called with pte lock held.
5531 */ 5543 */
5532 union mc_target { 5544 union mc_target {
5533 struct page *page; 5545 struct page *page;
5534 swp_entry_t ent; 5546 swp_entry_t ent;
5535 }; 5547 };
5536 5548
5537 enum mc_target_type { 5549 enum mc_target_type {
5538 MC_TARGET_NONE = 0, 5550 MC_TARGET_NONE = 0,
5539 MC_TARGET_PAGE, 5551 MC_TARGET_PAGE,
5540 MC_TARGET_SWAP, 5552 MC_TARGET_SWAP,
5541 }; 5553 };
5542 5554
5543 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5555 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5544 unsigned long addr, pte_t ptent) 5556 unsigned long addr, pte_t ptent)
5545 { 5557 {
5546 struct page *page = vm_normal_page(vma, addr, ptent); 5558 struct page *page = vm_normal_page(vma, addr, ptent);
5547 5559
5548 if (!page || !page_mapped(page)) 5560 if (!page || !page_mapped(page))
5549 return NULL; 5561 return NULL;
5550 if (PageAnon(page)) { 5562 if (PageAnon(page)) {
5551 /* we don't move shared anon */ 5563 /* we don't move shared anon */
5552 if (!move_anon()) 5564 if (!move_anon())
5553 return NULL; 5565 return NULL;
5554 } else if (!move_file()) 5566 } else if (!move_file())
5555 /* we ignore mapcount for file pages */ 5567 /* we ignore mapcount for file pages */
5556 return NULL; 5568 return NULL;
5557 if (!get_page_unless_zero(page)) 5569 if (!get_page_unless_zero(page))
5558 return NULL; 5570 return NULL;
5559 5571
5560 return page; 5572 return page;
5561 } 5573 }
5562 5574
5563 #ifdef CONFIG_SWAP 5575 #ifdef CONFIG_SWAP
5564 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5576 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5565 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5577 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5566 { 5578 {
5567 struct page *page = NULL; 5579 struct page *page = NULL;
5568 swp_entry_t ent = pte_to_swp_entry(ptent); 5580 swp_entry_t ent = pte_to_swp_entry(ptent);
5569 5581
5570 if (!move_anon() || non_swap_entry(ent)) 5582 if (!move_anon() || non_swap_entry(ent))
5571 return NULL; 5583 return NULL;
5572 /* 5584 /*
5573 * Because lookup_swap_cache() updates some statistics counter, 5585 * Because lookup_swap_cache() updates some statistics counter,
5574 * we call find_get_page() with swapper_space directly. 5586 * we call find_get_page() with swapper_space directly.
5575 */ 5587 */
5576 page = find_get_page(swap_address_space(ent), ent.val); 5588 page = find_get_page(swap_address_space(ent), ent.val);
5577 if (do_swap_account) 5589 if (do_swap_account)
5578 entry->val = ent.val; 5590 entry->val = ent.val;
5579 5591
5580 return page; 5592 return page;
5581 } 5593 }
5582 #else 5594 #else
5583 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5595 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5584 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5596 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5585 { 5597 {
5586 return NULL; 5598 return NULL;
5587 } 5599 }
5588 #endif 5600 #endif
5589 5601
5590 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5602 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5591 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5603 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5592 { 5604 {
5593 struct page *page = NULL; 5605 struct page *page = NULL;
5594 struct address_space *mapping; 5606 struct address_space *mapping;
5595 pgoff_t pgoff; 5607 pgoff_t pgoff;
5596 5608
5597 if (!vma->vm_file) /* anonymous vma */ 5609 if (!vma->vm_file) /* anonymous vma */
5598 return NULL; 5610 return NULL;
5599 if (!move_file()) 5611 if (!move_file())
5600 return NULL; 5612 return NULL;
5601 5613
5602 mapping = vma->vm_file->f_mapping; 5614 mapping = vma->vm_file->f_mapping;
5603 if (pte_none(ptent)) 5615 if (pte_none(ptent))
5604 pgoff = linear_page_index(vma, addr); 5616 pgoff = linear_page_index(vma, addr);
5605 else /* pte_file(ptent) is true */ 5617 else /* pte_file(ptent) is true */
5606 pgoff = pte_to_pgoff(ptent); 5618 pgoff = pte_to_pgoff(ptent);
5607 5619
5608 /* The page is moved even if it's not RSS of this task (page-faulted). */ 5620 /* The page is moved even if it's not RSS of this task (page-faulted). */
5609 #ifdef CONFIG_SWAP 5621 #ifdef CONFIG_SWAP
5610 /* shmem/tmpfs may report page out on swap: account for that too. */ 5622 /* shmem/tmpfs may report page out on swap: account for that too. */
5611 if (shmem_mapping(mapping)) { 5623 if (shmem_mapping(mapping)) {
5612 page = find_get_entry(mapping, pgoff); 5624 page = find_get_entry(mapping, pgoff);
5613 if (radix_tree_exceptional_entry(page)) { 5625 if (radix_tree_exceptional_entry(page)) {
5614 swp_entry_t swp = radix_to_swp_entry(page); 5626 swp_entry_t swp = radix_to_swp_entry(page);
5615 if (do_swap_account) 5627 if (do_swap_account)
5616 *entry = swp; 5628 *entry = swp;
5617 page = find_get_page(swap_address_space(swp), swp.val); 5629 page = find_get_page(swap_address_space(swp), swp.val);
5618 } 5630 }
5619 } else 5631 } else
5620 page = find_get_page(mapping, pgoff); 5632 page = find_get_page(mapping, pgoff);
5621 #else 5633 #else
5622 page = find_get_page(mapping, pgoff); 5634 page = find_get_page(mapping, pgoff);
5623 #endif 5635 #endif
5624 return page; 5636 return page;
5625 } 5637 }
5626 5638
5627 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5639 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5628 unsigned long addr, pte_t ptent, union mc_target *target) 5640 unsigned long addr, pte_t ptent, union mc_target *target)
5629 { 5641 {
5630 struct page *page = NULL; 5642 struct page *page = NULL;
5631 struct page_cgroup *pc; 5643 struct page_cgroup *pc;
5632 enum mc_target_type ret = MC_TARGET_NONE; 5644 enum mc_target_type ret = MC_TARGET_NONE;
5633 swp_entry_t ent = { .val = 0 }; 5645 swp_entry_t ent = { .val = 0 };
5634 5646
5635 if (pte_present(ptent)) 5647 if (pte_present(ptent))
5636 page = mc_handle_present_pte(vma, addr, ptent); 5648 page = mc_handle_present_pte(vma, addr, ptent);
5637 else if (is_swap_pte(ptent)) 5649 else if (is_swap_pte(ptent))
5638 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5650 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5639 else if (pte_none(ptent) || pte_file(ptent)) 5651 else if (pte_none(ptent) || pte_file(ptent))
5640 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5652 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5641 5653
5642 if (!page && !ent.val) 5654 if (!page && !ent.val)
5643 return ret; 5655 return ret;
5644 if (page) { 5656 if (page) {
5645 pc = lookup_page_cgroup(page); 5657 pc = lookup_page_cgroup(page);
5646 /* 5658 /*
5647 * Do only a loose check w/o serialization. 5659 * Do only a loose check w/o serialization.
5648 * mem_cgroup_move_account() checks whether the pc is valid 5660 * mem_cgroup_move_account() checks whether the pc is valid
5649 * under LRU exclusion. 5661 * under LRU exclusion.
5650 */ 5662 */
5651 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5663 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5652 ret = MC_TARGET_PAGE; 5664 ret = MC_TARGET_PAGE;
5653 if (target) 5665 if (target)
5654 target->page = page; 5666 target->page = page;
5655 } 5667 }
5656 if (!ret || !target) 5668 if (!ret || !target)
5657 put_page(page); 5669 put_page(page);
5658 } 5670 }
5659 /* There is a swap entry and a page doesn't exist or isn't charged */ 5671 /* There is a swap entry and a page doesn't exist or isn't charged */
5660 if (ent.val && !ret && 5672 if (ent.val && !ret &&
5661 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5673 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5662 ret = MC_TARGET_SWAP; 5674 ret = MC_TARGET_SWAP;
5663 if (target) 5675 if (target)
5664 target->ent = ent; 5676 target->ent = ent;
5665 } 5677 }
5666 return ret; 5678 return ret;
5667 } 5679 }
5668 5680
5669 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5681 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5670 /* 5682 /*
5671 * We don't consider swapping or file mapped pages because THP does not 5683 * We don't consider swapping or file mapped pages because THP does not
5672 * support them for now. 5684 * support them for now.
5673 * Caller should make sure that pmd_trans_huge(pmd) is true. 5685 * Caller should make sure that pmd_trans_huge(pmd) is true.
5674 */ 5686 */
5675 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5687 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5676 unsigned long addr, pmd_t pmd, union mc_target *target) 5688 unsigned long addr, pmd_t pmd, union mc_target *target)
5677 { 5689 {
5678 struct page *page = NULL; 5690 struct page *page = NULL;
5679 struct page_cgroup *pc; 5691 struct page_cgroup *pc;
5680 enum mc_target_type ret = MC_TARGET_NONE; 5692 enum mc_target_type ret = MC_TARGET_NONE;
5681 5693
5682 page = pmd_page(pmd); 5694 page = pmd_page(pmd);
5683 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5695 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5684 if (!move_anon()) 5696 if (!move_anon())
5685 return ret; 5697 return ret;
5686 pc = lookup_page_cgroup(page); 5698 pc = lookup_page_cgroup(page);
5687 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5699 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5688 ret = MC_TARGET_PAGE; 5700 ret = MC_TARGET_PAGE;
5689 if (target) { 5701 if (target) {
5690 get_page(page); 5702 get_page(page);
5691 target->page = page; 5703 target->page = page;
5692 } 5704 }
5693 } 5705 }
5694 return ret; 5706 return ret;
5695 } 5707 }
5696 #else 5708 #else
5697 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5709 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5698 unsigned long addr, pmd_t pmd, union mc_target *target) 5710 unsigned long addr, pmd_t pmd, union mc_target *target)
5699 { 5711 {
5700 return MC_TARGET_NONE; 5712 return MC_TARGET_NONE;
5701 } 5713 }
5702 #endif 5714 #endif
5703 5715
5704 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5716 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5705 unsigned long addr, unsigned long end, 5717 unsigned long addr, unsigned long end,
5706 struct mm_walk *walk) 5718 struct mm_walk *walk)
5707 { 5719 {
5708 struct vm_area_struct *vma = walk->private; 5720 struct vm_area_struct *vma = walk->private;
5709 pte_t *pte; 5721 pte_t *pte;
5710 spinlock_t *ptl; 5722 spinlock_t *ptl;
5711 5723
5712 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5724 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5713 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5725 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5714 mc.precharge += HPAGE_PMD_NR; 5726 mc.precharge += HPAGE_PMD_NR;
5715 spin_unlock(ptl); 5727 spin_unlock(ptl);
5716 return 0; 5728 return 0;
5717 } 5729 }
5718 5730
5719 if (pmd_trans_unstable(pmd)) 5731 if (pmd_trans_unstable(pmd))
5720 return 0; 5732 return 0;
5721 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5733 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5722 for (; addr != end; pte++, addr += PAGE_SIZE) 5734 for (; addr != end; pte++, addr += PAGE_SIZE)
5723 if (get_mctgt_type(vma, addr, *pte, NULL)) 5735 if (get_mctgt_type(vma, addr, *pte, NULL))
5724 mc.precharge++; /* increment precharge temporarily */ 5736 mc.precharge++; /* increment precharge temporarily */
5725 pte_unmap_unlock(pte - 1, ptl); 5737 pte_unmap_unlock(pte - 1, ptl);
5726 cond_resched(); 5738 cond_resched();
5727 5739
5728 return 0; 5740 return 0;
5729 } 5741 }
5730 5742
5731 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5743 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5732 { 5744 {
5733 unsigned long precharge; 5745 unsigned long precharge;
5734 struct vm_area_struct *vma; 5746 struct vm_area_struct *vma;
5735 5747
5736 down_read(&mm->mmap_sem); 5748 down_read(&mm->mmap_sem);
5737 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5749 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5738 struct mm_walk mem_cgroup_count_precharge_walk = { 5750 struct mm_walk mem_cgroup_count_precharge_walk = {
5739 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5751 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5740 .mm = mm, 5752 .mm = mm,
5741 .private = vma, 5753 .private = vma,
5742 }; 5754 };
5743 if (is_vm_hugetlb_page(vma)) 5755 if (is_vm_hugetlb_page(vma))
5744 continue; 5756 continue;
5745 walk_page_range(vma->vm_start, vma->vm_end, 5757 walk_page_range(vma->vm_start, vma->vm_end,
5746 &mem_cgroup_count_precharge_walk); 5758 &mem_cgroup_count_precharge_walk);
5747 } 5759 }
5748 up_read(&mm->mmap_sem); 5760 up_read(&mm->mmap_sem);
5749 5761
5750 precharge = mc.precharge; 5762 precharge = mc.precharge;
5751 mc.precharge = 0; 5763 mc.precharge = 0;
5752 5764
5753 return precharge; 5765 return precharge;
5754 } 5766 }
5755 5767
5756 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5768 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5757 { 5769 {
5758 unsigned long precharge = mem_cgroup_count_precharge(mm); 5770 unsigned long precharge = mem_cgroup_count_precharge(mm);
5759 5771
5760 VM_BUG_ON(mc.moving_task); 5772 VM_BUG_ON(mc.moving_task);
5761 mc.moving_task = current; 5773 mc.moving_task = current;
5762 return mem_cgroup_do_precharge(precharge); 5774 return mem_cgroup_do_precharge(precharge);
5763 } 5775 }
5764 5776
5765 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5777 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5766 static void __mem_cgroup_clear_mc(void) 5778 static void __mem_cgroup_clear_mc(void)
5767 { 5779 {
5768 struct mem_cgroup *from = mc.from; 5780 struct mem_cgroup *from = mc.from;
5769 struct mem_cgroup *to = mc.to; 5781 struct mem_cgroup *to = mc.to;
5770 int i;
5771 5782
5772 /* we must uncharge all the leftover precharges from mc.to */ 5783 /* we must uncharge all the leftover precharges from mc.to */
5773 if (mc.precharge) { 5784 if (mc.precharge) {
5774 cancel_charge(mc.to, mc.precharge); 5785 cancel_charge(mc.to, mc.precharge);
5775 mc.precharge = 0; 5786 mc.precharge = 0;
5776 } 5787 }
5777 /* 5788 /*
5778 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5789 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5779 * we must uncharge here. 5790 * we must uncharge here.
5780 */ 5791 */
5781 if (mc.moved_charge) { 5792 if (mc.moved_charge) {
5782 cancel_charge(mc.from, mc.moved_charge); 5793 cancel_charge(mc.from, mc.moved_charge);
5783 mc.moved_charge = 0; 5794 mc.moved_charge = 0;
5784 } 5795 }
5785 /* we must fixup refcnts and charges */ 5796 /* we must fixup refcnts and charges */
5786 if (mc.moved_swap) { 5797 if (mc.moved_swap) {
5787 /* uncharge swap account from the old cgroup */ 5798 /* uncharge swap account from the old cgroup */
5788 if (!mem_cgroup_is_root(mc.from)) 5799 if (!mem_cgroup_is_root(mc.from))
5789 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5800 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5790 5801
5791 /* 5802 /*
5792 * we charged both to->memory and to->memsw, so we 5803 * we charged both to->memory and to->memsw, so we
5793 * should uncharge to->memory. 5804 * should uncharge to->memory.
5794 */ 5805 */
5795 if (!mem_cgroup_is_root(mc.to)) 5806 if (!mem_cgroup_is_root(mc.to))
5796 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5807 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5797 5808
5798 for (i = 0; i < mc.moved_swap; i++) 5809 css_put_many(&mc.from->css, mc.moved_swap);
5799 css_put(&mc.from->css);
5800 5810
5801 /* we've already done css_get(mc.to) */ 5811 /* we've already done css_get(mc.to) */
5802 mc.moved_swap = 0; 5812 mc.moved_swap = 0;
5803 } 5813 }
5804 memcg_oom_recover(from); 5814 memcg_oom_recover(from);
5805 memcg_oom_recover(to); 5815 memcg_oom_recover(to);
5806 wake_up_all(&mc.waitq); 5816 wake_up_all(&mc.waitq);
5807 } 5817 }
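The change in this hunk collapses a loop of css_put() calls into a single css_put_many(): mc.moved_swap counts the references being dropped on mc.from's css, and the two forms release exactly the same number of them. Side by side, outside the diff markup:

	/* before: drop the references one at a time */
	for (i = 0; i < mc.moved_swap; i++)
		css_put(&mc.from->css);

	/* after: equivalent, in one call */
	css_put_many(&mc.from->css, mc.moved_swap);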
5808 5818
5809 static void mem_cgroup_clear_mc(void) 5819 static void mem_cgroup_clear_mc(void)
5810 { 5820 {
5811 struct mem_cgroup *from = mc.from; 5821 struct mem_cgroup *from = mc.from;
5812 5822
5813 /* 5823 /*
5814 * we must clear moving_task before waking up waiters at the end of 5824 * we must clear moving_task before waking up waiters at the end of
5815 * task migration. 5825 * task migration.
5816 */ 5826 */
5817 mc.moving_task = NULL; 5827 mc.moving_task = NULL;
5818 __mem_cgroup_clear_mc(); 5828 __mem_cgroup_clear_mc();
5819 spin_lock(&mc.lock); 5829 spin_lock(&mc.lock);
5820 mc.from = NULL; 5830 mc.from = NULL;
5821 mc.to = NULL; 5831 mc.to = NULL;
5822 spin_unlock(&mc.lock); 5832 spin_unlock(&mc.lock);
5823 mem_cgroup_end_move(from); 5833 mem_cgroup_end_move(from);
5824 } 5834 }
5825 5835
5826 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5836 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5827 struct cgroup_taskset *tset) 5837 struct cgroup_taskset *tset)
5828 { 5838 {
5829 struct task_struct *p = cgroup_taskset_first(tset); 5839 struct task_struct *p = cgroup_taskset_first(tset);
5830 int ret = 0; 5840 int ret = 0;
5831 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5841 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5832 unsigned long move_charge_at_immigrate; 5842 unsigned long move_charge_at_immigrate;
5833 5843
5834 /* 5844 /*
5835 * We are now committed to this value whatever it is. Changes in this 5845 * We are now committed to this value whatever it is. Changes in this
5836 * tunable will only affect upcoming migrations, not the current one. 5846 * tunable will only affect upcoming migrations, not the current one.
5837 * So we need to save it, and keep it going. 5847 * So we need to save it, and keep it going.
5838 */ 5848 */
5839 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5849 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
5840 if (move_charge_at_immigrate) { 5850 if (move_charge_at_immigrate) {
5841 struct mm_struct *mm; 5851 struct mm_struct *mm;
5842 struct mem_cgroup *from = mem_cgroup_from_task(p); 5852 struct mem_cgroup *from = mem_cgroup_from_task(p);
5843 5853
5844 VM_BUG_ON(from == memcg); 5854 VM_BUG_ON(from == memcg);
5845 5855
5846 mm = get_task_mm(p); 5856 mm = get_task_mm(p);
5847 if (!mm) 5857 if (!mm)
5848 return 0; 5858 return 0;
5849 /* We move charges only when we move the owner of the mm */ 5859 /* We move charges only when we move the owner of the mm */
5850 if (mm->owner == p) { 5860 if (mm->owner == p) {
5851 VM_BUG_ON(mc.from); 5861 VM_BUG_ON(mc.from);
5852 VM_BUG_ON(mc.to); 5862 VM_BUG_ON(mc.to);
5853 VM_BUG_ON(mc.precharge); 5863 VM_BUG_ON(mc.precharge);
5854 VM_BUG_ON(mc.moved_charge); 5864 VM_BUG_ON(mc.moved_charge);
5855 VM_BUG_ON(mc.moved_swap); 5865 VM_BUG_ON(mc.moved_swap);
5856 mem_cgroup_start_move(from); 5866 mem_cgroup_start_move(from);
5857 spin_lock(&mc.lock); 5867 spin_lock(&mc.lock);
5858 mc.from = from; 5868 mc.from = from;
5859 mc.to = memcg; 5869 mc.to = memcg;
5860 mc.immigrate_flags = move_charge_at_immigrate; 5870 mc.immigrate_flags = move_charge_at_immigrate;
5861 spin_unlock(&mc.lock); 5871 spin_unlock(&mc.lock);
5862 /* We set mc.moving_task later */ 5872 /* We set mc.moving_task later */
5863 5873
5864 ret = mem_cgroup_precharge_mc(mm); 5874 ret = mem_cgroup_precharge_mc(mm);
5865 if (ret) 5875 if (ret)
5866 mem_cgroup_clear_mc(); 5876 mem_cgroup_clear_mc();
5867 } 5877 }
5868 mmput(mm); 5878 mmput(mm);
5869 } 5879 }
5870 return ret; 5880 return ret;
5871 } 5881 }
5872 5882
5873 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5883 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
5874 struct cgroup_taskset *tset) 5884 struct cgroup_taskset *tset)
5875 { 5885 {
5876 mem_cgroup_clear_mc(); 5886 mem_cgroup_clear_mc();
5877 } 5887 }
5878 5888
5879 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5889 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5880 unsigned long addr, unsigned long end, 5890 unsigned long addr, unsigned long end,
5881 struct mm_walk *walk) 5891 struct mm_walk *walk)
5882 { 5892 {
5883 int ret = 0; 5893 int ret = 0;
5884 struct vm_area_struct *vma = walk->private; 5894 struct vm_area_struct *vma = walk->private;
5885 pte_t *pte; 5895 pte_t *pte;
5886 spinlock_t *ptl; 5896 spinlock_t *ptl;
5887 enum mc_target_type target_type; 5897 enum mc_target_type target_type;
5888 union mc_target target; 5898 union mc_target target;
5889 struct page *page; 5899 struct page *page;
5890 struct page_cgroup *pc; 5900 struct page_cgroup *pc;
5891 5901
5892 /* 5902 /*
5893 * We don't take compound_lock() here but no race with splitting thp 5903 * We don't take compound_lock() here but no race with splitting thp
5894 * happens because: 5904 * happens because:
5895 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5905 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5896 * under splitting, which means there's no concurrent thp split, 5906 * under splitting, which means there's no concurrent thp split,
5897 * - if another thread runs into split_huge_page() just after we 5907 * - if another thread runs into split_huge_page() just after we
5898 * entered this if-block, the thread must wait for page table lock 5908 * entered this if-block, the thread must wait for page table lock
5899 * to be unlocked in __split_huge_page_splitting(), where the main 5909 * to be unlocked in __split_huge_page_splitting(), where the main
5900 * part of thp split is not executed yet. 5910 * part of thp split is not executed yet.
5901 */ 5911 */
5902 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5912 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5903 if (mc.precharge < HPAGE_PMD_NR) { 5913 if (mc.precharge < HPAGE_PMD_NR) {
5904 spin_unlock(ptl); 5914 spin_unlock(ptl);
5905 return 0; 5915 return 0;
5906 } 5916 }
5907 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5917 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5908 if (target_type == MC_TARGET_PAGE) { 5918 if (target_type == MC_TARGET_PAGE) {
5909 page = target.page; 5919 page = target.page;
5910 if (!isolate_lru_page(page)) { 5920 if (!isolate_lru_page(page)) {
5911 pc = lookup_page_cgroup(page); 5921 pc = lookup_page_cgroup(page);
5912 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5922 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5913 pc, mc.from, mc.to)) { 5923 pc, mc.from, mc.to)) {
5914 mc.precharge -= HPAGE_PMD_NR; 5924 mc.precharge -= HPAGE_PMD_NR;
5915 mc.moved_charge += HPAGE_PMD_NR; 5925 mc.moved_charge += HPAGE_PMD_NR;
5916 } 5926 }
5917 putback_lru_page(page); 5927 putback_lru_page(page);
5918 } 5928 }
5919 put_page(page); 5929 put_page(page);
5920 } 5930 }
5921 spin_unlock(ptl); 5931 spin_unlock(ptl);
5922 return 0; 5932 return 0;
5923 } 5933 }
5924 5934
5925 if (pmd_trans_unstable(pmd)) 5935 if (pmd_trans_unstable(pmd))
5926 return 0; 5936 return 0;
5927 retry: 5937 retry:
5928 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5938 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5929 for (; addr != end; addr += PAGE_SIZE) { 5939 for (; addr != end; addr += PAGE_SIZE) {
5930 pte_t ptent = *(pte++); 5940 pte_t ptent = *(pte++);
5931 swp_entry_t ent; 5941 swp_entry_t ent;
5932 5942
5933 if (!mc.precharge) 5943 if (!mc.precharge)
5934 break; 5944 break;
5935 5945
5936 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5946 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5937 case MC_TARGET_PAGE: 5947 case MC_TARGET_PAGE:
5938 page = target.page; 5948 page = target.page;
5939 if (isolate_lru_page(page)) 5949 if (isolate_lru_page(page))
5940 goto put; 5950 goto put;
5941 pc = lookup_page_cgroup(page); 5951 pc = lookup_page_cgroup(page);
5942 if (!mem_cgroup_move_account(page, 1, pc, 5952 if (!mem_cgroup_move_account(page, 1, pc,
5943 mc.from, mc.to)) { 5953 mc.from, mc.to)) {
5944 mc.precharge--; 5954 mc.precharge--;
5945 /* we uncharge from mc.from later. */ 5955 /* we uncharge from mc.from later. */
5946 mc.moved_charge++; 5956 mc.moved_charge++;
5947 } 5957 }
5948 putback_lru_page(page); 5958 putback_lru_page(page);
5949 put: /* get_mctgt_type() gets the page */ 5959 put: /* get_mctgt_type() gets the page */
5950 put_page(page); 5960 put_page(page);
5951 break; 5961 break;
5952 case MC_TARGET_SWAP: 5962 case MC_TARGET_SWAP:
5953 ent = target.ent; 5963 ent = target.ent;
5954 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5964 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5955 mc.precharge--; 5965 mc.precharge--;
5956 /* we fixup refcnts and charges later. */ 5966 /* we fixup refcnts and charges later. */
5957 mc.moved_swap++; 5967 mc.moved_swap++;
5958 } 5968 }
5959 break; 5969 break;
5960 default: 5970 default:
5961 break; 5971 break;
5962 } 5972 }
5963 } 5973 }
5964 pte_unmap_unlock(pte - 1, ptl); 5974 pte_unmap_unlock(pte - 1, ptl);
5965 cond_resched(); 5975 cond_resched();
5966 5976
5967 if (addr != end) { 5977 if (addr != end) {
5968 /* 5978 /*
5969 * We have consumed all precharges we got in can_attach(). 5979 * We have consumed all precharges we got in can_attach().
5970 * We try charging one by one, but don't do any additional 5980 * We try charging one by one, but don't do any additional
5971 * charges to mc.to if charging already failed once in the attach() 5981 * charges to mc.to if charging already failed once in the attach()
5972 * phase. 5982 * phase.
5973 */ 5983 */
5974 ret = mem_cgroup_do_precharge(1); 5984 ret = mem_cgroup_do_precharge(1);
5975 if (!ret) 5985 if (!ret)
5976 goto retry; 5986 goto retry;
5977 } 5987 }
5978 5988
5979 return ret; 5989 return ret;
5980 } 5990 }
5981 5991
5982 static void mem_cgroup_move_charge(struct mm_struct *mm) 5992 static void mem_cgroup_move_charge(struct mm_struct *mm)
5983 { 5993 {
5984 struct vm_area_struct *vma; 5994 struct vm_area_struct *vma;
5985 5995
5986 lru_add_drain_all(); 5996 lru_add_drain_all();
5987 retry: 5997 retry:
5988 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5998 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5989 /* 5999 /*
5990 * Someone who is holding the mmap_sem might be waiting in 6000 * Someone who is holding the mmap_sem might be waiting in
5991 * waitq. So we cancel all extra charges, wake up all waiters, 6001 * waitq. So we cancel all extra charges, wake up all waiters,
5992 * and retry. Because we cancel precharges, we might not be able 6002 * and retry. Because we cancel precharges, we might not be able
5993 * to move enough charges, but moving charge is a best-effort 6003 * to move enough charges, but moving charge is a best-effort
5994 * feature anyway, so it wouldn't be a big problem. 6004 * feature anyway, so it wouldn't be a big problem.
5995 */ 6005 */
5996 __mem_cgroup_clear_mc(); 6006 __mem_cgroup_clear_mc();
5997 cond_resched(); 6007 cond_resched();
5998 goto retry; 6008 goto retry;
5999 } 6009 }
6000 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6010 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6001 int ret; 6011 int ret;
6002 struct mm_walk mem_cgroup_move_charge_walk = { 6012 struct mm_walk mem_cgroup_move_charge_walk = {
6003 .pmd_entry = mem_cgroup_move_charge_pte_range, 6013 .pmd_entry = mem_cgroup_move_charge_pte_range,
6004 .mm = mm, 6014 .mm = mm,
6005 .private = vma, 6015 .private = vma,
6006 }; 6016 };
6007 if (is_vm_hugetlb_page(vma)) 6017 if (is_vm_hugetlb_page(vma))
6008 continue; 6018 continue;
6009 ret = walk_page_range(vma->vm_start, vma->vm_end, 6019 ret = walk_page_range(vma->vm_start, vma->vm_end,
6010 &mem_cgroup_move_charge_walk); 6020 &mem_cgroup_move_charge_walk);
6011 if (ret) 6021 if (ret)
6012 /* 6022 /*
6013 * means we have consumed all precharges and failed in 6023 * means we have consumed all precharges and failed in
6014 * doing additional charge. Just abandon here. 6024 * doing additional charge. Just abandon here.
6015 */ 6025 */
6016 break; 6026 break;
6017 } 6027 }
6018 up_read(&mm->mmap_sem); 6028 up_read(&mm->mmap_sem);
6019 } 6029 }
6020 6030
6021 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6031 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6022 struct cgroup_taskset *tset) 6032 struct cgroup_taskset *tset)
6023 { 6033 {
6024 struct task_struct *p = cgroup_taskset_first(tset); 6034 struct task_struct *p = cgroup_taskset_first(tset);
6025 struct mm_struct *mm = get_task_mm(p); 6035 struct mm_struct *mm = get_task_mm(p);
6026 6036
6027 if (mm) { 6037 if (mm) {
6028 if (mc.to) 6038 if (mc.to)
6029 mem_cgroup_move_charge(mm); 6039 mem_cgroup_move_charge(mm);
6030 mmput(mm); 6040 mmput(mm);
6031 } 6041 }
6032 if (mc.to) 6042 if (mc.to)
6033 mem_cgroup_clear_mc(); 6043 mem_cgroup_clear_mc();
6034 } 6044 }
6035 #else /* !CONFIG_MMU */ 6045 #else /* !CONFIG_MMU */
6036 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6046 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6037 struct cgroup_taskset *tset) 6047 struct cgroup_taskset *tset)
6038 { 6048 {
6039 return 0; 6049 return 0;
6040 } 6050 }
6041 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6051 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6042 struct cgroup_taskset *tset) 6052 struct cgroup_taskset *tset)
6043 { 6053 {
6044 } 6054 }
6045 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6055 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6046 struct cgroup_taskset *tset) 6056 struct cgroup_taskset *tset)
6047 { 6057 {
6048 } 6058 }
6049 #endif 6059 #endif
6050 6060
6051 /* 6061 /*
6052 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6062 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6053 * to verify whether we're attached to the default hierarchy on each mount 6063 * to verify whether we're attached to the default hierarchy on each mount
6054 * attempt. 6064 * attempt.
6055 */ 6065 */
6056 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6066 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6057 { 6067 {
6058 /* 6068 /*
6059 * use_hierarchy is forced on the default hierarchy. cgroup core 6069 * use_hierarchy is forced on the default hierarchy. cgroup core
6060 * guarantees that @root doesn't have any children, so turning it 6070 * guarantees that @root doesn't have any children, so turning it
6061 * on for the root memcg is enough. 6071 * on for the root memcg is enough.
6062 */ 6072 */
6063 if (cgroup_on_dfl(root_css->cgroup)) 6073 if (cgroup_on_dfl(root_css->cgroup))
6064 mem_cgroup_from_css(root_css)->use_hierarchy = true; 6074 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6065 } 6075 }
6066 6076
6067 struct cgroup_subsys memory_cgrp_subsys = { 6077 struct cgroup_subsys memory_cgrp_subsys = {
6068 .css_alloc = mem_cgroup_css_alloc, 6078 .css_alloc = mem_cgroup_css_alloc,
6069 .css_online = mem_cgroup_css_online, 6079 .css_online = mem_cgroup_css_online,
6070 .css_offline = mem_cgroup_css_offline, 6080 .css_offline = mem_cgroup_css_offline,
6071 .css_free = mem_cgroup_css_free, 6081 .css_free = mem_cgroup_css_free,
6072 .css_reset = mem_cgroup_css_reset, 6082 .css_reset = mem_cgroup_css_reset,
6073 .can_attach = mem_cgroup_can_attach, 6083 .can_attach = mem_cgroup_can_attach,
6074 .cancel_attach = mem_cgroup_cancel_attach, 6084 .cancel_attach = mem_cgroup_cancel_attach,
6075 .attach = mem_cgroup_move_task, 6085 .attach = mem_cgroup_move_task,
6076 .bind = mem_cgroup_bind, 6086 .bind = mem_cgroup_bind,
6077 .legacy_cftypes = mem_cgroup_files, 6087 .legacy_cftypes = mem_cgroup_files,
6078 .early_init = 0, 6088 .early_init = 0,
6079 }; 6089 };
6080 6090
6081 #ifdef CONFIG_MEMCG_SWAP 6091 #ifdef CONFIG_MEMCG_SWAP
6082 static int __init enable_swap_account(char *s) 6092 static int __init enable_swap_account(char *s)
6083 { 6093 {
6084 if (!strcmp(s, "1")) 6094 if (!strcmp(s, "1"))
6085 really_do_swap_account = 1; 6095 really_do_swap_account = 1;
6086 else if (!strcmp(s, "0")) 6096 else if (!strcmp(s, "0"))
6087 really_do_swap_account = 0; 6097 really_do_swap_account = 0;
6088 return 1; 6098 return 1;
6089 } 6099 }
6090 __setup("swapaccount=", enable_swap_account); 6100 __setup("swapaccount=", enable_swap_account);
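enable_swap_cgroup() below only turns on do_swap_account when really_do_swap_account is set, so the parameter registered here lets the boot command line override the build-time default on a CONFIG_MEMCG_SWAP kernel, e.g.:

	swapaccount=0

keeps memsw accounting off (memsw_file_init() is never called, so the memsw control files are not registered), while swapaccount=1 forces it on.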
6091 6101
6092 static void __init memsw_file_init(void) 6102 static void __init memsw_file_init(void)
6093 { 6103 {
6094 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6104 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6095 memsw_cgroup_files)); 6105 memsw_cgroup_files));
6096 } 6106 }
6097 6107
6098 static void __init enable_swap_cgroup(void) 6108 static void __init enable_swap_cgroup(void)
6099 { 6109 {
6100 if (!mem_cgroup_disabled() && really_do_swap_account) { 6110 if (!mem_cgroup_disabled() && really_do_swap_account) {
6101 do_swap_account = 1; 6111 do_swap_account = 1;
6102 memsw_file_init(); 6112 memsw_file_init();
6103 } 6113 }
6104 } 6114 }
6105 6115
6106 #else 6116 #else
6107 static void __init enable_swap_cgroup(void) 6117 static void __init enable_swap_cgroup(void)
6108 { 6118 {
6109 } 6119 }
6110 #endif 6120 #endif
6111 6121
6112 #ifdef CONFIG_MEMCG_SWAP 6122 #ifdef CONFIG_MEMCG_SWAP
6113 /** 6123 /**
6114 * mem_cgroup_swapout - transfer a memsw charge to swap 6124 * mem_cgroup_swapout - transfer a memsw charge to swap
6115 * @page: page whose memsw charge to transfer 6125 * @page: page whose memsw charge to transfer
6116 * @entry: swap entry to move the charge to 6126 * @entry: swap entry to move the charge to
6117 * 6127 *
6118 * Transfer the memsw charge of @page to @entry. 6128 * Transfer the memsw charge of @page to @entry.
6119 */ 6129 */
6120 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6130 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6121 { 6131 {
6122 struct page_cgroup *pc; 6132 struct page_cgroup *pc;
6123 unsigned short oldid; 6133 unsigned short oldid;
6124 6134
6125 VM_BUG_ON_PAGE(PageLRU(page), page); 6135 VM_BUG_ON_PAGE(PageLRU(page), page);
6126 VM_BUG_ON_PAGE(page_count(page), page); 6136 VM_BUG_ON_PAGE(page_count(page), page);
6127 6137
6128 if (!do_swap_account) 6138 if (!do_swap_account)
6129 return; 6139 return;
6130 6140
6131 pc = lookup_page_cgroup(page); 6141 pc = lookup_page_cgroup(page);
6132 6142
6133 /* Readahead page, never charged */ 6143 /* Readahead page, never charged */
6134 if (!PageCgroupUsed(pc)) 6144 if (!PageCgroupUsed(pc))
6135 return; 6145 return;
6136 6146
6137 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 6147 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
6138 6148
6139 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); 6149 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6140 VM_BUG_ON_PAGE(oldid, page); 6150 VM_BUG_ON_PAGE(oldid, page);
6141 6151
6142 pc->flags &= ~PCG_MEMSW; 6152 pc->flags &= ~PCG_MEMSW;
6143 css_get(&pc->mem_cgroup->css); 6153 css_get(&pc->mem_cgroup->css);
6144 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 6154 mem_cgroup_swap_statistics(pc->mem_cgroup, true);
6145 } 6155 }
6146 6156
6147 /** 6157 /**
6148 * mem_cgroup_uncharge_swap - uncharge a swap entry 6158 * mem_cgroup_uncharge_swap - uncharge a swap entry
6149 * @entry: swap entry to uncharge 6159 * @entry: swap entry to uncharge
6150 * 6160 *
6151 * Drop the memsw charge associated with @entry. 6161 * Drop the memsw charge associated with @entry.
6152 */ 6162 */
6153 void mem_cgroup_uncharge_swap(swp_entry_t entry) 6163 void mem_cgroup_uncharge_swap(swp_entry_t entry)
6154 { 6164 {
6155 struct mem_cgroup *memcg; 6165 struct mem_cgroup *memcg;
6156 unsigned short id; 6166 unsigned short id;
6157 6167
6158 if (!do_swap_account) 6168 if (!do_swap_account)
6159 return; 6169 return;
6160 6170
6161 id = swap_cgroup_record(entry, 0); 6171 id = swap_cgroup_record(entry, 0);
6162 rcu_read_lock(); 6172 rcu_read_lock();
6163 memcg = mem_cgroup_lookup(id); 6173 memcg = mem_cgroup_lookup(id);
6164 if (memcg) { 6174 if (memcg) {
6165 if (!mem_cgroup_is_root(memcg)) 6175 if (!mem_cgroup_is_root(memcg))
6166 page_counter_uncharge(&memcg->memsw, 1); 6176 page_counter_uncharge(&memcg->memsw, 1);
6167 mem_cgroup_swap_statistics(memcg, false); 6177 mem_cgroup_swap_statistics(memcg, false);
6168 css_put(&memcg->css); 6178 css_put(&memcg->css);
6169 } 6179 }
6170 rcu_read_unlock(); 6180 rcu_read_unlock();
6171 } 6181 }
6172 #endif 6182 #endif
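Inside this #ifdef block the swap record's pin on the memcg is explicit: mem_cgroup_swapout() takes a css reference when the charge is transferred to the swap entry, and mem_cgroup_uncharge_swap() drops it when the entry is released. A minimal sketch of that pairing over a page's swap round-trip (the surrounding swap code is not part of this hunk):

	/* swap-out path: the charge follows the page into the swap entry */
	mem_cgroup_swapout(page, entry);	/* records the memcg id, css_get() */

	/* later, when the swap entry is freed */
	mem_cgroup_uncharge_swap(entry);	/* uncharges memsw, css_put() */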
6173 6183
6174 /** 6184 /**
6175 * mem_cgroup_try_charge - try charging a page 6185 * mem_cgroup_try_charge - try charging a page
6176 * @page: page to charge 6186 * @page: page to charge
6177 * @mm: mm context of the victim 6187 * @mm: mm context of the victim
6178 * @gfp_mask: reclaim mode 6188 * @gfp_mask: reclaim mode
6179 * @memcgp: charged memcg return 6189 * @memcgp: charged memcg return
6180 * 6190 *
6181 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6191 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6182 * pages according to @gfp_mask if necessary. 6192 * pages according to @gfp_mask if necessary.
6183 * 6193 *
6184 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 6194 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
6185 * Otherwise, an error code is returned. 6195 * Otherwise, an error code is returned.
6186 * 6196 *
6187 * After page->mapping has been set up, the caller must finalize the 6197 * After page->mapping has been set up, the caller must finalize the
6188 * charge with mem_cgroup_commit_charge(), or abort the transaction 6198 * charge with mem_cgroup_commit_charge(), or abort the transaction
6189 * with mem_cgroup_cancel_charge() in case page instantiation fails. 6199 * with mem_cgroup_cancel_charge() in case page instantiation fails.
6190 */ 6200 */
6191 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 6201 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6192 gfp_t gfp_mask, struct mem_cgroup **memcgp) 6202 gfp_t gfp_mask, struct mem_cgroup **memcgp)
6193 { 6203 {
6194 struct mem_cgroup *memcg = NULL; 6204 struct mem_cgroup *memcg = NULL;
6195 unsigned int nr_pages = 1; 6205 unsigned int nr_pages = 1;
6196 int ret = 0; 6206 int ret = 0;
6197 6207
6198 if (mem_cgroup_disabled()) 6208 if (mem_cgroup_disabled())
6199 goto out; 6209 goto out;
6200 6210
6201 if (PageSwapCache(page)) { 6211 if (PageSwapCache(page)) {
6202 struct page_cgroup *pc = lookup_page_cgroup(page); 6212 struct page_cgroup *pc = lookup_page_cgroup(page);
6203 /* 6213 /*
6204 * Every swap fault against a single page tries to charge the 6214 * Every swap fault against a single page tries to charge the
6205 * page, bail as early as possible. shmem_unuse() encounters 6215 * page, bail as early as possible. shmem_unuse() encounters
6206 * already charged pages, too. The USED bit is protected by 6216 * already charged pages, too. The USED bit is protected by
6207 * the page lock, which serializes swap cache removal, which 6217 * the page lock, which serializes swap cache removal, which
6208 * in turn serializes uncharging. 6218 * in turn serializes uncharging.
6209 */ 6219 */
6210 if (PageCgroupUsed(pc)) 6220 if (PageCgroupUsed(pc))
6211 goto out; 6221 goto out;
6212 } 6222 }
6213 6223
6214 if (PageTransHuge(page)) { 6224 if (PageTransHuge(page)) {
6215 nr_pages <<= compound_order(page); 6225 nr_pages <<= compound_order(page);
6216 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6226 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6217 } 6227 }
6218 6228
6219 if (do_swap_account && PageSwapCache(page)) 6229 if (do_swap_account && PageSwapCache(page))
6220 memcg = try_get_mem_cgroup_from_page(page); 6230 memcg = try_get_mem_cgroup_from_page(page);
6221 if (!memcg) 6231 if (!memcg)
6222 memcg = get_mem_cgroup_from_mm(mm); 6232 memcg = get_mem_cgroup_from_mm(mm);
6223 6233
6224 ret = try_charge(memcg, gfp_mask, nr_pages); 6234 ret = try_charge(memcg, gfp_mask, nr_pages);
6225 6235
6226 css_put(&memcg->css); 6236 css_put(&memcg->css);
6227 6237
6228 if (ret == -EINTR) { 6238 if (ret == -EINTR) {
6229 memcg = root_mem_cgroup; 6239 memcg = root_mem_cgroup;
6230 ret = 0; 6240 ret = 0;
6231 } 6241 }
6232 out: 6242 out:
6233 *memcgp = memcg; 6243 *memcgp = memcg;
6234 return ret; 6244 return ret;
6235 } 6245 }
6236 6246
6237 /** 6247 /**
6238 * mem_cgroup_commit_charge - commit a page charge 6248 * mem_cgroup_commit_charge - commit a page charge
6239 * @page: page to charge 6249 * @page: page to charge
6240 * @memcg: memcg to charge the page to 6250 * @memcg: memcg to charge the page to
6241 * @lrucare: page might be on LRU already 6251 * @lrucare: page might be on LRU already
6242 * 6252 *
6243 * Finalize a charge transaction started by mem_cgroup_try_charge(), 6253 * Finalize a charge transaction started by mem_cgroup_try_charge(),
6244 * after page->mapping has been set up. This must happen atomically 6254 * after page->mapping has been set up. This must happen atomically
6245 * as part of the page instantiation, i.e. under the page table lock 6255 * as part of the page instantiation, i.e. under the page table lock
6246 * for anonymous pages, under the page lock for page and swap cache. 6256 * for anonymous pages, under the page lock for page and swap cache.
6247 * 6257 *
6248 * In addition, the page must not be on the LRU during the commit, to 6258 * In addition, the page must not be on the LRU during the commit, to
6249 * prevent racing with task migration. If it might be, use @lrucare. 6259 * prevent racing with task migration. If it might be, use @lrucare.
6250 * 6260 *
6251 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 6261 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6252 */ 6262 */
6253 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 6263 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6254 bool lrucare) 6264 bool lrucare)
6255 { 6265 {
6256 unsigned int nr_pages = 1; 6266 unsigned int nr_pages = 1;
6257 6267
6258 VM_BUG_ON_PAGE(!page->mapping, page); 6268 VM_BUG_ON_PAGE(!page->mapping, page);
6259 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 6269 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6260 6270
6261 if (mem_cgroup_disabled()) 6271 if (mem_cgroup_disabled())
6262 return; 6272 return;
6263 /* 6273 /*
6264 * Swap faults will attempt to charge the same page multiple 6274 * Swap faults will attempt to charge the same page multiple
6265 * times. But reuse_swap_page() might have removed the page 6275 * times. But reuse_swap_page() might have removed the page
6266 * from swapcache already, so we can't check PageSwapCache(). 6276 * from swapcache already, so we can't check PageSwapCache().
6267 */ 6277 */
6268 if (!memcg) 6278 if (!memcg)
6269 return; 6279 return;
6270 6280
6271 commit_charge(page, memcg, lrucare); 6281 commit_charge(page, memcg, lrucare);
6272 6282
6273 if (PageTransHuge(page)) { 6283 if (PageTransHuge(page)) {
6274 nr_pages <<= compound_order(page); 6284 nr_pages <<= compound_order(page);
6275 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6285 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6276 } 6286 }
6277 6287
6278 local_irq_disable(); 6288 local_irq_disable();
6279 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6289 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6280 memcg_check_events(memcg, page); 6290 memcg_check_events(memcg, page);
6281 local_irq_enable(); 6291 local_irq_enable();
6282 6292
6283 if (do_swap_account && PageSwapCache(page)) { 6293 if (do_swap_account && PageSwapCache(page)) {
6284 swp_entry_t entry = { .val = page_private(page) }; 6294 swp_entry_t entry = { .val = page_private(page) };
6285 /* 6295 /*
6286 * The swap entry might not get freed for a long time, 6296 * The swap entry might not get freed for a long time,
6287 * let's not wait for it. The page already received a 6297 * let's not wait for it. The page already received a
6288 * memory+swap charge, drop the swap entry duplicate. 6298 * memory+swap charge, drop the swap entry duplicate.
6289 */ 6299 */
6290 mem_cgroup_uncharge_swap(entry); 6300 mem_cgroup_uncharge_swap(entry);
6291 } 6301 }
6292 } 6302 }
6293 6303
6294 /** 6304 /**
6295 * mem_cgroup_cancel_charge - cancel a page charge 6305 * mem_cgroup_cancel_charge - cancel a page charge
6296 * @page: page to charge 6306 * @page: page to charge
6297 * @memcg: memcg to charge the page to 6307 * @memcg: memcg to charge the page to
6298 * 6308 *
6299 * Cancel a charge transaction started by mem_cgroup_try_charge(). 6309 * Cancel a charge transaction started by mem_cgroup_try_charge().
6300 */ 6310 */
6301 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 6311 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6302 { 6312 {
6303 unsigned int nr_pages = 1; 6313 unsigned int nr_pages = 1;
6304 6314
6305 if (mem_cgroup_disabled()) 6315 if (mem_cgroup_disabled())
6306 return; 6316 return;
6307 /* 6317 /*
6308 * Swap faults will attempt to charge the same page multiple 6318 * Swap faults will attempt to charge the same page multiple
6309 * times. But reuse_swap_page() might have removed the page 6319 * times. But reuse_swap_page() might have removed the page
6310 * from swapcache already, so we can't check PageSwapCache(). 6320 * from swapcache already, so we can't check PageSwapCache().
6311 */ 6321 */
6312 if (!memcg) 6322 if (!memcg)
6313 return; 6323 return;
6314 6324
6315 if (PageTransHuge(page)) { 6325 if (PageTransHuge(page)) {
6316 nr_pages <<= compound_order(page); 6326 nr_pages <<= compound_order(page);
6317 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6327 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6318 } 6328 }
6319 6329
6320 cancel_charge(memcg, nr_pages); 6330 cancel_charge(memcg, nr_pages);
6321 } 6331 }
6322 6332
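For illustration, a minimal sketch (not from this patch) of how a caller is expected to drive this three-step transaction, loosely modeled on the anonymous fault path. map_new_anon_page() is a hypothetical stand-in for the rmap and page-table setup; the real fault path performs that setup, and commits the charge, while still holding the page table lock.

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* Hypothetical helper: install @page into @vma at @addr (rmap + page tables). */
static int map_new_anon_page(struct page *page, struct vm_area_struct *vma,
			     unsigned long addr);

static int charge_new_anon_page(struct page *page, struct vm_area_struct *vma,
				unsigned long addr)
{
	struct mem_cgroup *memcg;

	/* Reserve the charge, reclaiming according to the gfp mask if needed. */
	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg))
		return -ENOMEM;

	if (map_new_anon_page(page, vma, addr)) {
		/* Instantiation failed: abort the pending charge. */
		mem_cgroup_cancel_charge(page, memcg);
		return -ENOMEM;
	}

	/*
	 * page->mapping is now set up and the page is not yet on the
	 * LRU, so the charge can be committed with lrucare == false.
	 */
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}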
6323 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 6333 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6324 unsigned long nr_mem, unsigned long nr_memsw, 6334 unsigned long nr_mem, unsigned long nr_memsw,
6325 unsigned long nr_anon, unsigned long nr_file, 6335 unsigned long nr_anon, unsigned long nr_file,
6326 unsigned long nr_huge, struct page *dummy_page) 6336 unsigned long nr_huge, struct page *dummy_page)
6327 { 6337 {
6328 unsigned long flags; 6338 unsigned long flags;
6329 6339
6330 if (!mem_cgroup_is_root(memcg)) { 6340 if (!mem_cgroup_is_root(memcg)) {
6331 if (nr_mem) 6341 if (nr_mem)
6332 page_counter_uncharge(&memcg->memory, nr_mem); 6342 page_counter_uncharge(&memcg->memory, nr_mem);
6333 if (nr_memsw) 6343 if (nr_memsw)
6334 page_counter_uncharge(&memcg->memsw, nr_memsw); 6344 page_counter_uncharge(&memcg->memsw, nr_memsw);
6335 memcg_oom_recover(memcg); 6345 memcg_oom_recover(memcg);
6336 } 6346 }
6337 6347
6338 local_irq_save(flags); 6348 local_irq_save(flags);
6339 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6349 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
6340 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 6350 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6341 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 6351 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6342 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 6352 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6343 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 6353 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
6344 memcg_check_events(memcg, dummy_page); 6354 memcg_check_events(memcg, dummy_page);
6345 local_irq_restore(flags); 6355 local_irq_restore(flags);
6356
6357 if (!mem_cgroup_is_root(memcg))
6358 css_put_many(&memcg->css, max(nr_mem, nr_memsw));
6346 } 6359 }
6347 6360
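The css_put_many() added to the batch above drops the references that, with this patch, every charged page now holds on its memcg's css. A simplified sketch of that pairing (illustrative only; the real charge and uncharge paths batch the gets and puts, as seen here), assuming the css_get_many()/css_put_many() helpers from include/linux/cgroup.h:

#include <linux/cgroup.h>

/* One css reference is taken per page when a charge succeeds ... */
static void charge_pins_css(struct cgroup_subsys_state *css, unsigned int nr_pages)
{
	css_get_many(css, nr_pages);
}

/* ... and dropped again when the pages are uncharged. */
static void uncharge_unpins_css(struct cgroup_subsys_state *css, unsigned int nr_pages)
{
	css_put_many(css, nr_pages);
}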
6348 static void uncharge_list(struct list_head *page_list) 6361 static void uncharge_list(struct list_head *page_list)
6349 { 6362 {
6350 struct mem_cgroup *memcg = NULL; 6363 struct mem_cgroup *memcg = NULL;
6351 unsigned long nr_memsw = 0; 6364 unsigned long nr_memsw = 0;
6352 unsigned long nr_anon = 0; 6365 unsigned long nr_anon = 0;
6353 unsigned long nr_file = 0; 6366 unsigned long nr_file = 0;
6354 unsigned long nr_huge = 0; 6367 unsigned long nr_huge = 0;
6355 unsigned long pgpgout = 0; 6368 unsigned long pgpgout = 0;
6356 unsigned long nr_mem = 0; 6369 unsigned long nr_mem = 0;
6357 struct list_head *next; 6370 struct list_head *next;
6358 struct page *page; 6371 struct page *page;
6359 6372
6360 next = page_list->next; 6373 next = page_list->next;
6361 do { 6374 do {
6362 unsigned int nr_pages = 1; 6375 unsigned int nr_pages = 1;
6363 struct page_cgroup *pc; 6376 struct page_cgroup *pc;
6364 6377
6365 page = list_entry(next, struct page, lru); 6378 page = list_entry(next, struct page, lru);
6366 next = page->lru.next; 6379 next = page->lru.next;
6367 6380
6368 VM_BUG_ON_PAGE(PageLRU(page), page); 6381 VM_BUG_ON_PAGE(PageLRU(page), page);
6369 VM_BUG_ON_PAGE(page_count(page), page); 6382 VM_BUG_ON_PAGE(page_count(page), page);
6370 6383
6371 pc = lookup_page_cgroup(page); 6384 pc = lookup_page_cgroup(page);
6372 if (!PageCgroupUsed(pc)) 6385 if (!PageCgroupUsed(pc))
6373 continue; 6386 continue;
6374 6387
6375 /* 6388 /*
6376 * Nobody should be changing or seriously looking at 6389 * Nobody should be changing or seriously looking at
6377 * pc->mem_cgroup and pc->flags at this point, we have 6390 * pc->mem_cgroup and pc->flags at this point, we have
6378 * fully exclusive access to the page. 6391 * fully exclusive access to the page.
6379 */ 6392 */
6380 6393
6381 if (memcg != pc->mem_cgroup) { 6394 if (memcg != pc->mem_cgroup) {
6382 if (memcg) { 6395 if (memcg) {
6383 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6396 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6384 nr_anon, nr_file, nr_huge, page); 6397 nr_anon, nr_file, nr_huge, page);
6385 pgpgout = nr_mem = nr_memsw = 0; 6398 pgpgout = nr_mem = nr_memsw = 0;
6386 nr_anon = nr_file = nr_huge = 0; 6399 nr_anon = nr_file = nr_huge = 0;
6387 } 6400 }
6388 memcg = pc->mem_cgroup; 6401 memcg = pc->mem_cgroup;
6389 } 6402 }
6390 6403
6391 if (PageTransHuge(page)) { 6404 if (PageTransHuge(page)) {
6392 nr_pages <<= compound_order(page); 6405 nr_pages <<= compound_order(page);
6393 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6406 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6394 nr_huge += nr_pages; 6407 nr_huge += nr_pages;
6395 } 6408 }
6396 6409
6397 if (PageAnon(page)) 6410 if (PageAnon(page))
6398 nr_anon += nr_pages; 6411 nr_anon += nr_pages;
6399 else 6412 else
6400 nr_file += nr_pages; 6413 nr_file += nr_pages;
6401 6414
6402 if (pc->flags & PCG_MEM) 6415 if (pc->flags & PCG_MEM)
6403 nr_mem += nr_pages; 6416 nr_mem += nr_pages;
6404 if (pc->flags & PCG_MEMSW) 6417 if (pc->flags & PCG_MEMSW)
6405 nr_memsw += nr_pages; 6418 nr_memsw += nr_pages;
6406 pc->flags = 0; 6419 pc->flags = 0;
6407 6420
6408 pgpgout++; 6421 pgpgout++;
6409 } while (next != page_list); 6422 } while (next != page_list);
6410 6423
6411 if (memcg) 6424 if (memcg)
6412 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6425 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6413 nr_anon, nr_file, nr_huge, page); 6426 nr_anon, nr_file, nr_huge, page);
6414 } 6427 }
6415 6428
6416 /** 6429 /**
6417 * mem_cgroup_uncharge - uncharge a page 6430 * mem_cgroup_uncharge - uncharge a page
6418 * @page: page to uncharge 6431 * @page: page to uncharge
6419 * 6432 *
6420 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6433 * Uncharge a page previously charged with mem_cgroup_try_charge() and
6421 * mem_cgroup_commit_charge(). 6434 * mem_cgroup_commit_charge().
6422 */ 6435 */
6423 void mem_cgroup_uncharge(struct page *page) 6436 void mem_cgroup_uncharge(struct page *page)
6424 { 6437 {
6425 struct page_cgroup *pc; 6438 struct page_cgroup *pc;
6426 6439
6427 if (mem_cgroup_disabled()) 6440 if (mem_cgroup_disabled())
6428 return; 6441 return;
6429 6442
6430 /* Don't touch page->lru of any random page, pre-check: */ 6443 /* Don't touch page->lru of any random page, pre-check: */
6431 pc = lookup_page_cgroup(page); 6444 pc = lookup_page_cgroup(page);
6432 if (!PageCgroupUsed(pc)) 6445 if (!PageCgroupUsed(pc))
6433 return; 6446 return;
6434 6447
6435 INIT_LIST_HEAD(&page->lru); 6448 INIT_LIST_HEAD(&page->lru);
6436 uncharge_list(&page->lru); 6449 uncharge_list(&page->lru);
6437 } 6450 }
6438 6451
6439 /** 6452 /**
6440 * mem_cgroup_uncharge_list - uncharge a list of pages 6453 * mem_cgroup_uncharge_list - uncharge a list of pages
6441 * @page_list: list of pages to uncharge 6454 * @page_list: list of pages to uncharge
6442 * 6455 *
6443 * Uncharge a list of pages previously charged with 6456 * Uncharge a list of pages previously charged with
6444 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6457 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
6445 */ 6458 */
6446 void mem_cgroup_uncharge_list(struct list_head *page_list) 6459 void mem_cgroup_uncharge_list(struct list_head *page_list)
6447 { 6460 {
6448 if (mem_cgroup_disabled()) 6461 if (mem_cgroup_disabled())
6449 return; 6462 return;
6450 6463
6451 if (!list_empty(page_list)) 6464 if (!list_empty(page_list))
6452 uncharge_list(page_list); 6465 uncharge_list(page_list);
6453 } 6466 }
6454 6467
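For illustration, a minimal sketch (not from this patch) of a bulk free path using the list variant, loosely modeled on release_pages(). The real path also takes the lru_lock and removes each page from the LRU before queueing it; that part is elided here.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

static void drop_page_refs(struct page **pages, int nr)
{
	LIST_HEAD(pages_to_free);
	int i;

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/* Only pages whose last reference just went away get freed. */
		if (!put_page_testzero(page))
			continue;

		/* Assumes the page has already been taken off the LRU. */
		list_add(&page->lru, &pages_to_free);
	}

	/* Drop the memcg charges for the whole batch in one pass ... */
	mem_cgroup_uncharge_list(&pages_to_free);
	/* ... then hand the pages back to the page allocator. */
	free_hot_cold_page_list(&pages_to_free, true);
}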
6455 /** 6468 /**
6456 * mem_cgroup_migrate - migrate a charge to another page 6469 * mem_cgroup_migrate - migrate a charge to another page
6457 * @oldpage: currently charged page 6470 * @oldpage: currently charged page
6458 * @newpage: page to transfer the charge to 6471 * @newpage: page to transfer the charge to
6459 * @lrucare: both pages might be on the LRU already 6472 * @lrucare: both pages might be on the LRU already
6460 * 6473 *
6461 * Migrate the charge from @oldpage to @newpage. 6474 * Migrate the charge from @oldpage to @newpage.
6462 * 6475 *
6463 * Both pages must be locked, @newpage->mapping must be set up. 6476 * Both pages must be locked, @newpage->mapping must be set up.
6464 */ 6477 */
6465 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 6478 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6466 bool lrucare) 6479 bool lrucare)
6467 { 6480 {
6468 struct page_cgroup *pc; 6481 struct page_cgroup *pc;
6469 int isolated; 6482 int isolated;
6470 6483
6471 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6484 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6472 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6485 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6473 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 6486 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
6474 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 6487 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
6475 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6488 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6476 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6489 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6477 newpage); 6490 newpage);
6478 6491
6479 if (mem_cgroup_disabled()) 6492 if (mem_cgroup_disabled())
6480 return; 6493 return;
6481 6494
6482 /* Page cache replacement: new page already charged? */ 6495 /* Page cache replacement: new page already charged? */
6483 pc = lookup_page_cgroup(newpage); 6496 pc = lookup_page_cgroup(newpage);
6484 if (PageCgroupUsed(pc)) 6497 if (PageCgroupUsed(pc))
6485 return; 6498 return;
6486 6499
6487 /* Re-entrant migration: old page already uncharged? */ 6500 /* Re-entrant migration: old page already uncharged? */
6488 pc = lookup_page_cgroup(oldpage); 6501 pc = lookup_page_cgroup(oldpage);
6489 if (!PageCgroupUsed(pc)) 6502 if (!PageCgroupUsed(pc))
6490 return; 6503 return;
6491 6504
6492 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); 6505 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6493 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); 6506 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6494 6507
6495 if (lrucare) 6508 if (lrucare)
6496 lock_page_lru(oldpage, &isolated); 6509 lock_page_lru(oldpage, &isolated);
6497 6510
6498 pc->flags = 0; 6511 pc->flags = 0;
6499 6512
6500 if (lrucare) 6513 if (lrucare)
6501 unlock_page_lru(oldpage, isolated); 6514 unlock_page_lru(oldpage, isolated);
6502 6515
6503 commit_charge(newpage, pc->mem_cgroup, lrucare); 6516 commit_charge(newpage, pc->mem_cgroup, lrucare);
6504 } 6517 }
6505 6518
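For illustration, a minimal sketch (not from this patch) of the lrucare case, loosely modeled on page cache replacement: both pages are locked, the new page's mapping has been set up, and either page may already be on the LRU. The radix tree update itself is elided.

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical wrapper: replace @old with @new and move the charge along. */
static void replace_cached_page(struct page *old, struct page *new)
{
	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);

	/* ... swap @old for @new in the mapping's radix tree ... */

	mem_cgroup_migrate(old, new, true);
}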
6506 /* 6519 /*
6507 * subsys_initcall() for memory controller. 6520 * subsys_initcall() for memory controller.
6508 * 6521 *
6509 * Some parts like hotcpu_notifier() have to be initialized from this context 6522 * Some parts like hotcpu_notifier() have to be initialized from this context
6510 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 6523 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6511 * everything that doesn't depend on a specific mem_cgroup structure should 6524 * everything that doesn't depend on a specific mem_cgroup structure should
6512 * be initialized from here. 6525 * be initialized from here.
6513 */ 6526 */
6514 static int __init mem_cgroup_init(void) 6527 static int __init mem_cgroup_init(void)
6515 { 6528 {
6516 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6529 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6517 enable_swap_cgroup(); 6530 enable_swap_cgroup();
6518 mem_cgroup_soft_limit_tree_init(); 6531 mem_cgroup_soft_limit_tree_init();
6519 memcg_stock_init(); 6532 memcg_stock_init();
6520 return 0; 6533 return 0;
6521 } 6534 }