mm/memcontrol.c

  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
   * Memory thresholds
   * Copyright (C) 2009 Nokia Corporation
   * Author: Kirill A. Shutemov
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
  
  #include <linux/res_counter.h>
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
  #include <linux/mm.h>
  #include <linux/hugetlb.h>
  #include <linux/pagemap.h>
  #include <linux/smp.h>
  #include <linux/page-flags.h>
  #include <linux/backing-dev.h>
  #include <linux/bit_spinlock.h>
  #include <linux/rcupdate.h>
  #include <linux/limits.h>
  #include <linux/mutex.h>
  #include <linux/rbtree.h>
  #include <linux/slab.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/spinlock.h>
  #include <linux/eventfd.h>
  #include <linux/sort.h>
  #include <linux/fs.h>
  #include <linux/seq_file.h>
  #include <linux/vmalloc.h>
  #include <linux/mm_inline.h>
  #include <linux/page_cgroup.h>
  #include <linux/cpu.h>
  #include "internal.h"

  #include <asm/uaccess.h>
  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  #define MEM_CGROUP_RECLAIM_RETRIES	5
  struct mem_cgroup *root_mem_cgroup __read_mostly;

  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
  int do_swap_account __read_mostly;
  static int really_do_swap_account __initdata = 1; /* for remembering the boot option */
  #else
  #define do_swap_account		(0)
  #endif
  /*
   * The per-memcg event counter is incremented at every pagein/pageout. This
   * counter is used to trigger some periodic events. This is straightforward
   * and better than using jiffies etc. to handle periodic memcg events.
   *
   * These values will be used as !((event) & ((1 <<(thresh)) - 1))
   */
  #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
  #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
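  /*
   * Worked example (illustrative note, not part of the original source):
   * with THRESHOLDS_EVENTS_THRESH == 7, the check
   * !((event) & ((1 << 7) - 1)) masks the low 7 bits of the counter, so it
   * is true on events 0, 128, 256, ... -- i.e. once every 128
   * pagein/pageout events. With SOFTLIMIT_EVENTS_THRESH == 10 the same
   * check fires once every 1024 events.
   */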

  /*
   * Statistics for memory cgroup.
   */
  enum mem_cgroup_stat_index {
  	/*
  	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  	 */
  	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
  	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
  	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
  	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
  	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
  	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
  	MEM_CGROUP_EVENTS,	/* incremented at every  pagein/pageout */
  
  	MEM_CGROUP_STAT_NSTATS,
  };
  
  struct mem_cgroup_stat_cpu {
  	s64 count[MEM_CGROUP_STAT_NSTATS];
  };
  /*
   * per-zone information in memory controller.
   */
  struct mem_cgroup_per_zone {
  	/*
  	 * spin_lock to protect the per cgroup LRU
  	 */
  	struct list_head	lists[NR_LRU_LISTS];
  	unsigned long		count[NR_LRU_LISTS];
  
  	struct zone_reclaim_stat reclaim_stat;
  	struct rb_node		tree_node;	/* RB tree node */
  	unsigned long long	usage_in_excess;/* Set to the value by which */
  						/* the soft limit is exceeded*/
  	bool			on_tree;
  	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
  						/* use container_of	   */
  };
  /* Macro for accessing counter */
  #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
  
  struct mem_cgroup_per_node {
  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_lru_info {
  	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
  };
  
  /*
   * Cgroups above their limits are maintained in a RB-Tree, independent of
   * their hierarchy representation
   */
  
  struct mem_cgroup_tree_per_zone {
  	struct rb_root rb_root;
  	spinlock_t lock;
  };
  
  struct mem_cgroup_tree_per_node {
  	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_tree {
  	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  };
  
  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
  struct mem_cgroup_threshold {
  	struct eventfd_ctx *eventfd;
  	u64 threshold;
  };
  /* For threshold */
  struct mem_cgroup_threshold_ary {
  	/* An array index points to threshold just below usage. */
  	int current_threshold;
  	/* Size of entries[] */
  	unsigned int size;
  	/* Array of thresholds */
  	struct mem_cgroup_threshold entries[0];
  };
  
  struct mem_cgroup_thresholds {
  	/* Primary thresholds array */
  	struct mem_cgroup_threshold_ary *primary;
  	/*
  	 * Spare threshold array.
  	 * This is needed to make mem_cgroup_unregister_event() "never fail".
  	 * It must be able to store at least primary->size - 1 entries.
  	 */
  	struct mem_cgroup_threshold_ary *spare;
  };
  /* for OOM */
  struct mem_cgroup_eventfd_list {
  	struct list_head list;
  	struct eventfd_ctx *eventfd;
  };

  static void mem_cgroup_threshold(struct mem_cgroup *mem);
  static void mem_cgroup_oom_notify(struct mem_cgroup *mem);

  /*
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
   *
   * TODO: Add a water mark for the memory controller. Reclaim will begin when
   * we hit the water mark. Maybe even add a low water mark, such that
   * no reclaim occurs from a cgroup at its low water mark; this is
   * a feature that will be implemented much later in the future.
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
  	/*
  	 * the counter to account for memory usage
  	 */
  	struct res_counter res;
  	/*
  	 * the counter to account for mem+swap usage.
  	 */
  	struct res_counter memsw;
  	/*
  	 * Per cgroup active and inactive list, similar to the
  	 * per zone LRU lists.
  	 */
  	struct mem_cgroup_lru_info info;

  	/*
  	 * protects reclaim-related members
  	 */
  	spinlock_t reclaim_param_lock;
  	int	prev_priority;	/* for recording reclaim priority */
  
  	/*
  	 * While reclaiming in a hierarchy, we cache the last child we
  	 * reclaimed from.
  	 */
  	int last_scanned_child;
  	/*
  	 * Should the accounting and control be hierarchical, per subtree?
  	 */
  	bool use_hierarchy;
  	atomic_t	oom_lock;
  	atomic_t	refcnt;

  	unsigned int	swappiness;
  	/* OOM-Killer disable */
  	int		oom_kill_disable;

  	/* set when res.limit == memsw.limit */
  	bool		memsw_is_minimum;
  	/* protect arrays of thresholds */
  	struct mutex thresholds_lock;
  
  	/* thresholds for memory usage. RCU-protected */
  	struct mem_cgroup_thresholds thresholds;

  	/* thresholds for mem+swap usage. RCU-protected */
  	struct mem_cgroup_thresholds memsw_thresholds;

  	/* For oom notifier event fd */
  	struct list_head oom_notify;
  	/*
  	 * Should we move charges of a task when a task is moved into this
  	 * mem_cgroup ? And what type of charges should we move ?
  	 */
  	unsigned long 	move_charge_at_immigrate;
  	/*
  	 * percpu counter.
  	 */
  	struct mem_cgroup_stat_cpu *stat;
  };
  /* Stuffs for move charges at task migration. */
  /*
   * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
   * left-shifted bitmap of these types.
   */
  enum move_type {
  	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
  	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
  	NR_MOVE_TYPE,
  };
  /* "mc" and its members are protected by cgroup_mutex */
  static struct move_charge_struct {
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
  	unsigned long precharge;
  	unsigned long moved_charge;
  	unsigned long moved_swap;
  	struct task_struct *moving_task;	/* a task moving charges */
  	wait_queue_head_t waitq;		/* a waitq for other context */
  } mc = {
  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  };

  static bool move_anon(void)
  {
  	return test_bit(MOVE_CHARGE_TYPE_ANON,
  					&mc.to->move_charge_at_immigrate);
  }
  static bool move_file(void)
  {
  	return test_bit(MOVE_CHARGE_TYPE_FILE,
  					&mc.to->move_charge_at_immigrate);
  }
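  /*
   * Illustrative note (not part of the original source): userspace selects
   * these types by writing a bitmask to the memory.move_charge_at_immigrate
   * control file; e.g. writing 3 sets bit MOVE_CHARGE_TYPE_ANON (0x1) and
   * bit MOVE_CHARGE_TYPE_FILE (0x2), so both move_anon() and move_file()
   * above return true for the destination cgroup.
   */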
  /*
   * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
   * limit reclaim to prevent infinite loops, if they ever occur.
   */
  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
  #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
  	MEM_CGROUP_CHARGE_TYPE_MAPPED,
  	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
  	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
  	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
  	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
  	NR_CHARGE_TYPE,
  };
  /* only for here (for easy reading.) */
  #define PCGF_CACHE	(1UL << PCG_CACHE)
  #define PCGF_USED	(1UL << PCG_USED)
  #define PCGF_LOCK	(1UL << PCG_LOCK)
  /* Not used, but added here for completeness */
  #define PCGF_ACCT	(1UL << PCG_ACCT)

  /* for encoding cft->private value on file */
  #define _MEM			(0)
  #define _MEMSWAP		(1)
  #define _OOM_TYPE		(2)
  #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
  #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
  #define MEMFILE_ATTR(val)	((val) & 0xffff)
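  /*
   * Illustrative example (not part of the original source): for the
   * mem+swap limit file, cft->private would be built and decoded as
   *
   *	val = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
   *	MEMFILE_TYPE(val) == _MEMSWAP, MEMFILE_ATTR(val) == RES_LIMIT
   */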
  /* Used for OOM notifier */
  #define OOM_CONTROL		(0)

  /*
   * Reclaim flags for mem_cgroup_hierarchical_reclaim
   */
  #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
  #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
  #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
  #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
  #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
  #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
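  /*
   * Illustrative note (not part of the original source): callers OR these
   * flags into the reclaim_options argument of
   * mem_cgroup_hierarchical_reclaim() below; e.g.
   * MEM_CGROUP_RECLAIM_NOSWAP | MEM_CGROUP_RECLAIM_SHRINK asks for a
   * reclaim pass that avoids swapping and returns as soon as one victim
   * has been scanned.
   */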

  static void mem_cgroup_get(struct mem_cgroup *mem);
  static void mem_cgroup_put(struct mem_cgroup *mem);
  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
  static void drain_all_stock_async(void);

  static struct mem_cgroup_per_zone *
  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
  {
  	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
  }
  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
  {
  	return &mem->css;
  }
  static struct mem_cgroup_per_zone *
  page_cgroup_zoneinfo(struct page_cgroup *pc)
  {
  	struct mem_cgroup *mem = pc->mem_cgroup;
  	int nid = page_cgroup_nid(pc);
  	int zid = page_cgroup_zid(pc);
  
  	if (!mem)
  		return NULL;
  
  	return mem_cgroup_zoneinfo(mem, nid, zid);
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static void
  __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz,
  				unsigned long long new_usage_in_excess)
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
  
  static void
  __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz)
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
  
  static void
  mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz)
  {
  	spin_lock(&mctz->lock);
  	__mem_cgroup_remove_exceeded(mem, mz, mctz);
  	spin_unlock(&mctz->lock);
  }
  
  static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
  {
  	unsigned long long excess;
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  	mctz = soft_limit_tree_from_page(page);
  
  	/*
  	 * Necessary to update all ancestors when hierarchy is used,
  	 * because their event counter is not touched.
  	 */
  	for (; mem; mem = parent_mem_cgroup(mem)) {
  		mz = mem_cgroup_zoneinfo(mem, nid, zid);
  		excess = res_counter_soft_limit_excess(&mem->res);
  		/*
  		 * We have to update the tree if mz is on RB-tree or
  		 * mem is over its softlimit.
  		 */
  		if (excess || mz->on_tree) {
  			spin_lock(&mctz->lock);
  			/* if on-tree, remove it */
  			if (mz->on_tree)
  				__mem_cgroup_remove_exceeded(mem, mz, mctz);
  			/*
  			 * Insert again. mz->usage_in_excess will be updated.
  			 * If excess is 0, no tree ops.
  			 */
  			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
  			spin_unlock(&mctz->lock);
  		}
  	}
  }
  
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
  {
  	int node, zone;
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
  
  	for_each_node_state(node, N_POSSIBLE) {
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			mz = mem_cgroup_zoneinfo(mem, node, zone);
  			mctz = soft_limit_tree_node_zone(node, zone);
  			mem_cgroup_remove_exceeded(mem, mz, mctz);
  		}
  	}
  }
  static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
  {
  	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
  }
  
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back;
  	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
  	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
  	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
  		!css_tryget(&mz->mem->css))
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
  
  	spin_lock(&mctz->lock);
  	mz = __mem_cgroup_largest_soft_limit_node(mctz);
  	spin_unlock(&mctz->lock);
  	return mz;
  }
  static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
  		enum mem_cgroup_stat_index idx)
  {
  	int cpu;
  	s64 val = 0;
  
  	for_each_possible_cpu(cpu)
  		val += per_cpu(mem->stat->count[idx], cpu);
  	return val;
  }
  
  static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
  {
  	s64 ret;
  
  	ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
  	ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
  	return ret;
  }
  static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
  					 bool charge)
  {
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
  }
  static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
  					 struct page_cgroup *pc,
  					 bool charge)
  {
  	int val = (charge) ? 1 : -1;

  	preempt_disable();
  	if (PageCgroupCache(pc))
  		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
  	else
  		__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
  
  	if (charge)
  		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
  	else
  		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
  	__this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);

  	preempt_enable();
  }
  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
  					enum lru_list idx)
  {
  	int nid, zid;
  	struct mem_cgroup_per_zone *mz;
  	u64 total = 0;
  
  	for_each_online_node(nid)
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			mz = mem_cgroup_zoneinfo(mem, nid, zid);
  			total += MEM_CGROUP_ZSTAT(mz, idx);
  		}
  	return total;
  }
  static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
  {
  	s64 val;
  
  	val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);
  
  	return !(val & ((1 << event_mask_shift) - 1));
  }
  
  /*
   * Check events in order.
   *
   */
  static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
  {
  	/* threshold event is triggered in finer grain than soft limit */
  	if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
  		mem_cgroup_threshold(mem);
  		if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
  			mem_cgroup_update_tree(mem, page);
  	}
  }
  static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  {
  	return container_of(cgroup_subsys_state(cont,
  				mem_cgroup_subsys_id), struct mem_cgroup,
  				css);
  }
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  {
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
  	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
  				struct mem_cgroup, css);
  }
  static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  {
  	struct mem_cgroup *mem = NULL;
  
  	if (!mm)
  		return NULL;
  	/*
  	 * Because we have no locks, mm->owner may be being moved to another
  	 * cgroup. We use css_tryget() here even if this looks
  	 * pessimistic (rather than adding locks here).
  	 */
  	rcu_read_lock();
  	do {
  		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
  		if (unlikely(!mem))
  			break;
  	} while (!css_tryget(&mem->css));
  	rcu_read_unlock();
  	return mem;
  }
  /*
   * Call the callback function against all cgroups under the hierarchy tree.
   */
  static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
  			  int (*func)(struct mem_cgroup *, void *))
  {
  	int found, ret, nextid;
  	struct cgroup_subsys_state *css;
  	struct mem_cgroup *mem;
  
  	if (!root->use_hierarchy)
  		return (*func)(root, data);
  
  	nextid = 1;
  	do {
  		ret = 0;
  		mem = NULL;
  
  		rcu_read_lock();
  		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
  				   &found);
  		if (css && css_tryget(css))
  			mem = container_of(css, struct mem_cgroup, css);
  		rcu_read_unlock();
  
  		if (mem) {
  			ret = (*func)(mem, data);
  			css_put(&mem->css);
  		}
  		nextid = found + 1;
  	} while (!ret && css);
  
  	return ret;
  }
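  /*
   * Illustrative note (not part of the original source): a callback that
   * returns 0 lets the walk continue, while a non-zero return value stops
   * it and is propagated to the caller. See mem_cgroup_count_children()
   * below, which uses this walk with mem_cgroup_count_children_cb() to
   * count the memcgs in a hierarchy.
   */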
  static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
  {
  	return (mem == root_mem_cgroup);
  }
  /*
   * The following LRU functions are allowed to be used without PCG_LOCK.
   * Operations are called by routines of the global LRU independently of memcg.
   * What we have to take care of here is the validity of pc->mem_cgroup.
   *
   * Changes to pc->mem_cgroup happen when
   * 1. charging
   * 2. moving account
   * In the typical case, "charge" is done before add-to-lru. The exception is
   * SwapCache, which is added to the LRU before being charged.
   * If the PCG_USED bit is not set, the page_cgroup is not added to this
   * private LRU.
   * When moving account, the page is not on the LRU; it is isolated.
   */

  void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup_per_zone *mz;

  	if (mem_cgroup_disabled())
  		return;
  	pc = lookup_page_cgroup(page);
  	/* can happen while we handle swapcache. */
  	if (!TestClearPageCgroupAcctLRU(pc))
  		return;
  	VM_BUG_ON(!pc->mem_cgroup);
  	/*
  	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
  	 * removed from global LRU.
  	 */
  	mz = page_cgroup_zoneinfo(pc);
  	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
  	if (mem_cgroup_is_root(pc->mem_cgroup))
  		return;
  	VM_BUG_ON(list_empty(&pc->lru));
  	list_del_init(&pc->lru);
  	return;
  }
  void mem_cgroup_del_lru(struct page *page)
  {
  	mem_cgroup_del_lru_list(page, page_lru(page));
  }

  void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
  {
  	struct mem_cgroup_per_zone *mz;
  	struct page_cgroup *pc;

  	if (mem_cgroup_disabled())
  		return;

  	pc = lookup_page_cgroup(page);
  	/*
  	 * Used bit is set without atomic ops but after smp_wmb().
  	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
  	 */
  	smp_rmb();
  	/* unused or root page is not rotated. */
  	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
  		return;
  	mz = page_cgroup_zoneinfo(pc);
  	list_move(&pc->lru, &mz->lists[lru]);
  }
  void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup_per_zone *mz;

  	if (mem_cgroup_disabled())
  		return;
  	pc = lookup_page_cgroup(page);
  	VM_BUG_ON(PageCgroupAcctLRU(pc));
  	/*
  	 * Used bit is set without atomic ops but after smp_wmb().
  	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
  	 */
  	smp_rmb();
  	if (!PageCgroupUsed(pc))
  		return;

  	mz = page_cgroup_zoneinfo(pc);
  	MEM_CGROUP_ZSTAT(mz, lru) += 1;
  	SetPageCgroupAcctLRU(pc);
  	if (mem_cgroup_is_root(pc->mem_cgroup))
  		return;
  	list_add(&pc->lru, &mz->lists[lru]);
  }

  /*
   * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
   * the lru, because the page may be reused after it's fully uncharged (because
   * of SwapCache behavior). To handle that, unlink the page_cgroup from the LRU
   * when charging it again. This function is only used to charge SwapCache. It's
   * done under lock_page and it is expected that zone->lru_lock is never held.
   */
  static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
  {
  	unsigned long flags;
  	struct zone *zone = page_zone(page);
  	struct page_cgroup *pc = lookup_page_cgroup(page);
  
  	spin_lock_irqsave(&zone->lru_lock, flags);
  	/*
  	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
  	 * is guarded by lock_page() because the page is SwapCache.
  	 */
  	if (!PageCgroupUsed(pc))
  		mem_cgroup_del_lru_list(page, page_lru(page));
  	spin_unlock_irqrestore(&zone->lru_lock, flags);
  }
  static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
  {
  	unsigned long flags;
  	struct zone *zone = page_zone(page);
  	struct page_cgroup *pc = lookup_page_cgroup(page);
  
  	spin_lock_irqsave(&zone->lru_lock, flags);
  	/* link when the page is linked to LRU but page_cgroup isn't */
  	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
  		mem_cgroup_add_lru_list(page, page_lru(page));
  	spin_unlock_irqrestore(&zone->lru_lock, flags);
  }
  void mem_cgroup_move_lists(struct page *page,
  			   enum lru_list from, enum lru_list to)
  {
  	if (mem_cgroup_disabled())
  		return;
  	mem_cgroup_del_lru_list(page, from);
  	mem_cgroup_add_lru_list(page, to);
  }
  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
  {
  	int ret;
  	struct mem_cgroup *curr = NULL;
  
  	task_lock(task);
  	rcu_read_lock();
  	curr = try_get_mem_cgroup_from_mm(task->mm);
  	rcu_read_unlock();
  	task_unlock(task);
  	if (!curr)
  		return 0;
  	/*
  	 * We should check use_hierarchy of "mem", not "curr". Checking
  	 * use_hierarchy of "curr" here would make this function true if hierarchy
  	 * is enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
  	 * hierarchy (even if use_hierarchy is disabled in "mem").
  	 */
  	if (mem->use_hierarchy)
  		ret = css_is_ancestor(&curr->css, &mem->css);
  	else
  		ret = (curr == mem);
  	css_put(&curr->css);
  	return ret;
  }
  /*
   * prev_priority control... this will be used in the memory reclaim path.
   */
  int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
  {
  	int prev_priority;
  
  	spin_lock(&mem->reclaim_param_lock);
  	prev_priority = mem->prev_priority;
  	spin_unlock(&mem->reclaim_param_lock);
  
  	return prev_priority;
  }
  
  void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
  {
  	spin_lock(&mem->reclaim_param_lock);
  	if (priority < mem->prev_priority)
  		mem->prev_priority = priority;
  	spin_unlock(&mem->reclaim_param_lock);
  }
  
  void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
  {
  	spin_lock(&mem->reclaim_param_lock);
  	mem->prev_priority = priority;
  	spin_unlock(&mem->reclaim_param_lock);
  }
  static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
  {
  	unsigned long active;
  	unsigned long inactive;
  	unsigned long gb;
  	unsigned long inactive_ratio;

  	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
  	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);

  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
  	if (gb)
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
  
  	if (present_pages) {
  		present_pages[0] = inactive;
  		present_pages[1] = active;
  	}
  
  	return inactive_ratio;
  }
  
  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
  {
  	unsigned long active;
  	unsigned long inactive;
  	unsigned long present_pages[2];
  	unsigned long inactive_ratio;
  
  	inactive_ratio = calc_inactive_ratio(memcg, present_pages);
  
  	inactive = present_pages[0];
  	active = present_pages[1];
  
  	if (inactive * inactive_ratio < active)
  		return 1;
  
  	return 0;
  }
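  /*
   * Worked example (illustrative note, not part of the original source,
   * assuming 4KB pages): for a memcg with 4GB of anonymous pages,
   * gb = 4 and inactive_ratio = int_sqrt(10 * 4) = 6, so
   * mem_cgroup_inactive_anon_is_low() returns 1 once
   * inactive * 6 < active, i.e. when less than roughly 1/7 of the
   * anon pages are on the inactive list.
   */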
  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
  {
  	unsigned long active;
  	unsigned long inactive;
  
  	inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
  	active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
  
  	return (active > inactive);
  }
  unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
  				       struct zone *zone,
  				       enum lru_list lru)
  {
  	int nid = zone->zone_pgdat->node_id;
  	int zid = zone_idx(zone);
  	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
  
  	return MEM_CGROUP_ZSTAT(mz, lru);
  }
  struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
  						      struct zone *zone)
  {
  	int nid = zone->zone_pgdat->node_id;
  	int zid = zone_idx(zone);
  	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
  
  	return &mz->reclaim_stat;
  }
  
  struct zone_reclaim_stat *
  mem_cgroup_get_reclaim_stat_from_page(struct page *page)
  {
  	struct page_cgroup *pc;
  	struct mem_cgroup_per_zone *mz;
  
  	if (mem_cgroup_disabled())
  		return NULL;
  
  	pc = lookup_page_cgroup(page);
  	/*
  	 * Used bit is set without atomic ops but after smp_wmb().
  	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
  	 */
  	smp_rmb();
  	if (!PageCgroupUsed(pc))
  		return NULL;
  	mz = page_cgroup_zoneinfo(pc);
  	if (!mz)
  		return NULL;
  
  	return &mz->reclaim_stat;
  }
  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
  					struct list_head *dst,
  					unsigned long *scanned, int order,
  					int mode, struct zone *z,
  					struct mem_cgroup *mem_cont,
  					int active, int file)
  {
  	unsigned long nr_taken = 0;
  	struct page *page;
  	unsigned long scan;
  	LIST_HEAD(pc_list);
  	struct list_head *src;
  	struct page_cgroup *pc, *tmp;
  	int nid = z->zone_pgdat->node_id;
  	int zid = zone_idx(z);
  	struct mem_cgroup_per_zone *mz;
  	int lru = LRU_FILE * file + active;
  	int ret;

  	BUG_ON(!mem_cont);
  	mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
  	src = &mz->lists[lru];

  	scan = 0;
  	list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
  		if (scan >= nr_to_scan)
  			break;
  
  		page = pc->page;
  		if (unlikely(!PageCgroupUsed(pc)))
  			continue;
  		if (unlikely(!PageLRU(page)))
  			continue;

  		scan++;
  		ret = __isolate_lru_page(page, mode, file);
  		switch (ret) {
  		case 0:
  			list_move(&page->lru, dst);
  			mem_cgroup_del_lru(page);
  			nr_taken++;
  			break;
  		case -EBUSY:
  			/* we don't affect global LRU but rotate in our LRU */
  			mem_cgroup_rotate_lru_list(page, page_lru(page));
  			break;
  		default:
  			break;
  		}
  	}
  	*scanned = scan;
  	return nr_taken;
  }
  #define mem_cgroup_from_res_counter(counter, member)	\
  	container_of(counter, struct mem_cgroup, member)
  static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
  {
  	if (do_swap_account) {
  		if (res_counter_check_under_limit(&mem->res) &&
  			res_counter_check_under_limit(&mem->memsw))
  			return true;
  	} else
  		if (res_counter_check_under_limit(&mem->res))
  			return true;
  	return false;
  }
  static unsigned int get_swappiness(struct mem_cgroup *memcg)
  {
  	struct cgroup *cgrp = memcg->css.cgroup;
  	unsigned int swappiness;
  
  	/* root ? */
  	if (cgrp->parent == NULL)
  		return vm_swappiness;
  
  	spin_lock(&memcg->reclaim_param_lock);
  	swappiness = memcg->swappiness;
  	spin_unlock(&memcg->reclaim_param_lock);
  
  	return swappiness;
  }
  static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
  {
  	int *val = data;
  	(*val)++;
  	return 0;
  }
  
  /**
   * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
  	struct cgroup *task_cgrp;
  	struct cgroup *mem_cgrp;
  	/*
  	 * Need a buffer in BSS, can't rely on allocations. The code relies
  	 * on the assumption that OOM is serialized for memory controller.
  	 * If this assumption is broken, revisit this code.
  	 */
  	static char memcg_name[PATH_MAX];
  	int ret;
  	if (!memcg || !p)
  		return;
  
  
  	rcu_read_lock();
  
  	mem_cgrp = memcg->css.cgroup;
  	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
  
  	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
  	if (ret < 0) {
  		/*
  		 * Unfortunately, we are unable to convert to a useful name,
  		 * but we'll still print out the usage information.
  		 */
  		rcu_read_unlock();
  		goto done;
  	}
  	rcu_read_unlock();
  
  	printk(KERN_INFO "Task in %s killed", memcg_name);
  
  	rcu_read_lock();
  	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
  	if (ret < 0) {
  		rcu_read_unlock();
  		goto done;
  	}
  	rcu_read_unlock();
  
  	/*
  	 * Continues from above, so we don't need a KERN_ level
  	 */
  	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
  done:
  
  	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
  		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->res, RES_FAILCNT));
  	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
  		"failcnt %llu\n",
  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
  }
  /*
   * This function returns the number of memcgs under the hierarchy tree.
   * Returns 1 (self count) if there are no children.
   */
  static int mem_cgroup_count_children(struct mem_cgroup *mem)
  {
  	int num = 0;
   	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
  	return num;
  }
  /*
   * Visit the first child (need not be the first child as per the ordering
   * of the cgroup list, since we track last_scanned_child) of @mem and use
   * that to reclaim free pages from.
   */
  static struct mem_cgroup *
  mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  {
  	struct mem_cgroup *ret = NULL;
  	struct cgroup_subsys_state *css;
  	int nextid, found;
  
  	if (!root_mem->use_hierarchy) {
  		css_get(&root_mem->css);
  		ret = root_mem;
  	}
  
  	while (!ret) {
  		rcu_read_lock();
  		nextid = root_mem->last_scanned_child + 1;
  		css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
  				   &found);
  		if (css && css_tryget(css))
  			ret = container_of(css, struct mem_cgroup, css);
  
  		rcu_read_unlock();
  		/* Updates scanning parameter */
  		spin_lock(&root_mem->reclaim_param_lock);
  		if (!css) {
  			/* this means start scan from ID:1 */
  			root_mem->last_scanned_child = 0;
  		} else
  			root_mem->last_scanned_child = found;
  		spin_unlock(&root_mem->reclaim_param_lock);
  	}
  
  	return ret;
  }
  
  /*
   * Scan the hierarchy if needed to reclaim memory. We remember the last child
   * we reclaimed from, so that we don't end up penalizing one child extensively
   * based on its position in the children list.
   *
   * root_mem is the original ancestor that we've been reclaiming from.
   *
   * We give up and return to the caller when we visit root_mem twice.
   * (other groups can be removed while we're walking....)
   *
   * If shrink==true, this returns immediately, to avoid freeing too much.
   */
  static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
4e4169535   Balbir Singh   memory controller...
1161
  						struct zone *zone,
75822b449   Balbir Singh   memory controller...
1162
1163
  						gfp_t gfp_mask,
  						unsigned long reclaim_options)
6d61ef409   Balbir Singh   memcg: memory cgr...
1164
  {
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1165
1166
1167
  	struct mem_cgroup *victim;
  	int ret, total = 0;
  	int loop = 0;
75822b449   Balbir Singh   memory controller...
1168
1169
  	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
  	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
4e4169535   Balbir Singh   memory controller...
1170
1171
  	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
  	unsigned long excess = mem_cgroup_get_excess(root_mem);
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1172

22a668d7c   KAMEZAWA Hiroyuki   memcg: fix behavi...
1173
1174
1175
	/* If memsw_is_minimum==1, swap-out is of no use. */
  	if (root_mem->memsw_is_minimum)
  		noswap = true;
4e4169535   Balbir Singh   memory controller...
1176
  	while (1) {
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1177
  		victim = mem_cgroup_select_victim(root_mem);
4e4169535   Balbir Singh   memory controller...
1178
  		if (victim == root_mem) {
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1179
  			loop++;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1180
1181
  			if (loop >= 1)
  				drain_all_stock_async();
4e4169535   Balbir Singh   memory controller...
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
				 * anything, it might be because there are
  				 * no reclaimable pages under this hierarchy
  				 */
  				if (!check_soft || !total) {
  					css_put(&victim->css);
  					break;
  				}
  				/*
				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too large, so we don't
				 * reclaim too much, nor too small, so that we
				 * don't keep coming back to reclaim from this
				 * cgroup
  				 */
  				if (total >= (excess >> 2) ||
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
  					css_put(&victim->css);
  					break;
  				}
  			}
  		}
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
1205
  		if (!mem_cgroup_local_usage(victim)) {
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1206
1207
  			/* this cgroup's local usage == 0 */
  			css_put(&victim->css);
6d61ef409   Balbir Singh   memcg: memory cgr...
1208
1209
  			continue;
  		}
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1210
  		/* we use swappiness of local cgroup */
4e4169535   Balbir Singh   memory controller...
1211
1212
1213
1214
1215
1216
1217
  		if (check_soft)
  			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
  				noswap, get_swappiness(victim), zone,
  				zone->zone_pgdat->node_id);
  		else
  			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
  						noswap, get_swappiness(victim));
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1218
  		css_put(&victim->css);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1219
1220
1221
1222
1223
1224
1225
  		/*
		 * When shrinking usage, we can't check whether we should stop
		 * here or reclaim more; that depends on the caller.
		 * last_scanned_child is enough to keep fairness under the tree.
  		 */
  		if (shrink)
  			return ret;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1226
  		total += ret;
4e4169535   Balbir Singh   memory controller...
1227
1228
1229
1230
  		if (check_soft) {
  			if (res_counter_check_under_soft_limit(&root_mem->res))
  				return total;
  		} else if (mem_cgroup_check_under_limit(root_mem))
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1231
  			return 1 + total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1232
  	}
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1233
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1234
  }
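
/*
 * The soft-limit stop condition above (give up once total >= excess >> 2,
 * i.e. a quarter of the excess, or once a loop budget is spent) is plain
 * arithmetic.  A standalone user-space sketch of just that check, with
 * made-up numbers and a made-up MAX_LOOPS stand-in (illustration only, not
 * kernel code):
 */
#include <stdio.h>

#define MAX_LOOPS 4	/* stand-in for the kernel's reclaim loop budget */

int main(void)
{
	unsigned long excess = 1024;		/* pages over the soft limit */
	unsigned long freed_per_pass = 100;	/* pretend reclaim result */
	unsigned long total = 0;
	int loop;

	for (loop = 0; loop <= MAX_LOOPS; loop++) {
		total += freed_per_pass;
		if (total >= (excess >> 2)) {
			printf("stop: reclaimed %lu >= excess/4 = %lu\n",
			       total, excess >> 2);
			return 0;
		}
	}
	printf("stop: loop budget spent, reclaimed %lu\n", total);
	return 0;
}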
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1235
  static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1236
  {
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
  	int *val = (int *)data;
  	int x;
  	/*
	 * Logically, we could stop scanning immediately when we find
	 * a memcg that is already locked. But considering unlock ops and
	 * creation/removal of memcgs, scanning them all is the simpler operation.
  	 */
  	x = atomic_inc_return(&mem->oom_lock);
  	*val = max(x, *val);
  	return 0;
  }
  /*
 * Check whether the OOM killer is already running under our hierarchy.
 * If someone is already running it, return false.
   */
  static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
  {
  	int lock_count = 0;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1255

867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1256
1257
1258
1259
1260
  	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
  
  	if (lock_count == 1)
  		return true;
  	return false;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1261
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1262

867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1263
  static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1264
  {
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1265
1266
1267
1268
1269
1270
  	/*
  	 * When a new child is created while the hierarchy is under oom,
  	 * mem_cgroup_oom_lock() may not be called. We have to use
  	 * atomic_add_unless() here.
  	 */
  	atomic_add_unless(&mem->oom_lock, -1, 0);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1271
1272
  	return 0;
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1273
1274
1275
1276
1277
1278
1279
  static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
  {
  	mem_cgroup_walk_tree(mem, NULL,	mem_cgroup_oom_unlock_cb);
  }
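
/*
 * The lock_cb/unlock_cb pair above implements a hierarchy-wide trylock by
 * counting: every group in the subtree gets its counter bumped and the walk
 * records the maximum value seen; a maximum of 1 means nobody else held any
 * of the counters.  A standalone, single-threaded user-space sketch of that
 * counting trick (illustration only, not kernel code; the group array is
 * made up, and the kernel does the increments atomically under
 * memcg_oom_mutex):
 */
#include <stdio.h>

#define NGROUPS 3

static int oom_lock[NGROUPS];	/* one counter per group in the subtree */

static int subtree_trylock(void)
{
	int max = 0;

	for (int i = 0; i < NGROUPS; i++) {
		int x = ++oom_lock[i];

		if (x > max)
			max = x;
	}
	return max == 1;	/* true: we are the only locker */
}

static void subtree_unlock(void)
{
	for (int i = 0; i < NGROUPS; i++)
		if (oom_lock[i] > 0)
			oom_lock[i]--;
}

int main(void)
{
	printf("first locker owns the subtree:  %d\n", subtree_trylock());
	printf("second locker owns the subtree: %d\n", subtree_trylock());
	subtree_unlock();
	subtree_unlock();
	return 0;
}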
  
  static DEFINE_MUTEX(memcg_oom_mutex);
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
  struct oom_wait_info {
  	struct mem_cgroup *mem;
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
  	struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
  
  	if (oom_wait_info->mem == wake_mem)
  		goto wakeup;
  	/* if no hierarchy, no match */
  	if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
  		return 0;
  	/*
	 * Both oom_wait_info->mem and wake_mem are stable under us,
	 * so we can use css_is_ancestor() without worrying about RCU.
  	 */
  	if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
  	    !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
  		return 0;
  
  wakeup:
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
  
  static void memcg_wakeup_oom(struct mem_cgroup *mem)
  {
  	/* for filtering, pass "mem" as argument. */
  	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
  }
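
/*
 * The filter in memcg_oom_wake_function() above wakes a waiter only when
 * its memcg and the waking memcg are the same, or one is an ancestor of the
 * other.  A standalone user-space sketch of that check over a plain
 * parent-pointer tree (illustration only, not kernel code; the group names
 * are made up):
 */
#include <stdio.h>
#include <stdbool.h>

struct grp {
	const char *name;
	struct grp *parent;
};

static bool is_ancestor(struct grp *a, struct grp *d)
{
	for (; d; d = d->parent)
		if (d == a)
			return true;
	return false;
}

static bool should_wake(struct grp *waiter, struct grp *waker)
{
	return waiter == waker ||
	       is_ancestor(waiter, waker) || is_ancestor(waker, waiter);
}

int main(void)
{
	struct grp root = { "root", NULL };
	struct grp a = { "a", &root }, b = { "b", &root }, a1 = { "a1", &a };

	printf("wake waiter in a when a1 recovers: %d\n", should_wake(&a, &a1));
	printf("wake waiter in b when a1 recovers: %d\n", should_wake(&b, &a1));
	return 0;
}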
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1315
1316
  static void memcg_oom_recover(struct mem_cgroup *mem)
  {
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
1317
  	if (atomic_read(&mem->oom_lock))
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1318
1319
  		memcg_wakeup_oom(mem);
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1320
1321
1322
1323
  /*
   * try to call OOM killer. returns false if we should exit memory-reclaim loop.
   */
  bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1324
  {
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1325
  	struct oom_wait_info owait;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1326
  	bool locked, need_to_kill;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1327

dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1328
1329
1330
1331
1332
  	owait.mem = mem;
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1333
  	need_to_kill = true;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1334
1335
1336
1337
1338
1339
1340
1341
  	/* At first, try to OOM lock hierarchy under mem.*/
  	mutex_lock(&memcg_oom_mutex);
  	locked = mem_cgroup_oom_lock(mem);
  	/*
	 * Even if signal_pending(), we can't quit the charge() loop without
	 * accounting, so UNINTERRUPTIBLE is appropriate. But a SIGKILL
	 * under OOM is always welcome, so use TASK_KILLABLE here.
  	 */
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1342
1343
1344
1345
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
  	if (!locked || mem->oom_kill_disable)
  		need_to_kill = false;
  	if (locked)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
1346
  		mem_cgroup_oom_notify(mem);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1347
  	mutex_unlock(&memcg_oom_mutex);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1348
1349
  	if (need_to_kill) {
  		finish_wait(&memcg_oom_waitq, &owait.wait);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1350
  		mem_cgroup_out_of_memory(mem, mask);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1351
  	} else {
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1352
  		schedule();
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1353
  		finish_wait(&memcg_oom_waitq, &owait.wait);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1354
1355
1356
  	}
  	mutex_lock(&memcg_oom_mutex);
  	mem_cgroup_oom_unlock(mem);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1357
  	memcg_wakeup_oom(mem);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1358
1359
1360
1361
1362
1363
1364
  	mutex_unlock(&memcg_oom_mutex);
  
  	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
  		return false;
  	/* Give chance to dying process */
  	schedule_timeout(1);
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1365
  }
d69b042f3   Balbir Singh   memcg: add file-b...
1366
1367
1368
1369
  /*
   * Currently used to update mapped file statistics, but the routine can be
   * generalized to update other statistics as well.
   */
d8046582d   KAMEZAWA Hiroyuki   memcg: make memcg...
1370
  void mem_cgroup_update_file_mapped(struct page *page, int val)
d69b042f3   Balbir Singh   memcg: add file-b...
1371
1372
  {
  	struct mem_cgroup *mem;
d69b042f3   Balbir Singh   memcg: add file-b...
1373
  	struct page_cgroup *pc;
d69b042f3   Balbir Singh   memcg: add file-b...
1374
1375
1376
1377
1378
1379
  	pc = lookup_page_cgroup(page);
  	if (unlikely(!pc))
  		return;
  
  	lock_page_cgroup(pc);
  	mem = pc->mem_cgroup;
8725d5416   KAMEZAWA Hiroyuki   memcg: fix race i...
1380
  	if (!mem || !PageCgroupUsed(pc))
d69b042f3   Balbir Singh   memcg: add file-b...
1381
1382
1383
  		goto done;
  
  	/*
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
1384
  	 * Preemption is already disabled. We can use __this_cpu_xxx
d69b042f3   Balbir Singh   memcg: add file-b...
1385
  	 */
8725d5416   KAMEZAWA Hiroyuki   memcg: fix race i...
1386
1387
1388
1389
1390
1391
1392
  	if (val > 0) {
  		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		SetPageCgroupFileMapped(pc);
  	} else {
  		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		ClearPageCgroupFileMapped(pc);
  	}
d69b042f3   Balbir Singh   memcg: add file-b...
1393

d69b042f3   Balbir Singh   memcg: add file-b...
1394
1395
1396
  done:
  	unlock_page_cgroup(pc);
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1397

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1398
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
 * size of the first charge trial. "32" comes from vmscan.c's magic value.
 * TODO: it may be necessary to use bigger numbers on big iron.
   */
  #define CHARGE_SIZE	(32 * PAGE_SIZE)
  struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
  	int charge;
  	struct work_struct work;
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
  static atomic_t memcg_drain_count;
  
  /*
 * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed
 * from the local stock and true is returned. If the stock is 0 or holds charges
 * from a cgroup which is not the current target, false is returned and the
 * stock will be refilled.
   */
  static bool consume_stock(struct mem_cgroup *mem)
  {
  	struct memcg_stock_pcp *stock;
  	bool ret = true;
  
  	stock = &get_cpu_var(memcg_stock);
  	if (mem == stock->cached && stock->charge)
  		stock->charge -= PAGE_SIZE;
  	else /* need to call res_counter_charge */
  		ret = false;
  	put_cpu_var(memcg_stock);
  	return ret;
  }
  
  /*
 * Return the stock cached in the percpu area to the res_counter and reset
 * the cached information.
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
  
  	if (stock->charge) {
  		res_counter_uncharge(&old->res, stock->charge);
  		if (do_swap_account)
  			res_counter_uncharge(&old->memsw, stock->charge);
  	}
  	stock->cached = NULL;
  	stock->charge = 0;
  }
  
  /*
 * This must be called with preemption disabled, or by a thread which is
 * pinned to the local cpu.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
  	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
  	drain_stock(stock);
  }
  
  /*
 * Cache charges(val), which come from the res_counter, in the local per-cpu area.
320cc51d9   Greg Thelen   mm: fix typo in r...
1459
 * They will be consumed by consume_stock() later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
   */
  static void refill_stock(struct mem_cgroup *mem, int val)
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
  
  	if (stock->cached != mem) { /* reset if necessary */
  		drain_stock(stock);
  		stock->cached = mem;
  	}
  	stock->charge += val;
  	put_cpu_var(memcg_stock);
  }
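
/*
 * consume_stock()/refill_stock() above amount to: take charges from the
 * shared counter in big chunks, park the unused part locally, and serve
 * later single-page charges from that local stash.  A single-threaded,
 * standalone user-space sketch of the same idea (illustration only, not
 * kernel code; PAGE_SZ, BATCH and the globals are made up, and the kernel
 * keeps one stock per cpu):
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SZ	4096
#define BATCH	(32 * PAGE_SZ)		/* like CHARGE_SIZE above */

static long shared_usage;		/* like the res_counter usage */
static long stock;			/* like memcg_stock_pcp.charge */

static bool consume_stock(void)
{
	if (stock >= PAGE_SZ) {
		stock -= PAGE_SZ;	/* served locally, shared counter untouched */
		return true;
	}
	return false;
}

static void charge_one_page(void)
{
	if (consume_stock())
		return;
	shared_usage += BATCH;		/* one shared-counter update ...    */
	stock += BATCH - PAGE_SZ;	/* ... covers the next 31 pages too */
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		charge_one_page();
	printf("shared updates cover %ld pages, %ld pages still stocked\n",
	       shared_usage / PAGE_SZ, stock / PAGE_SZ);
	return 0;
}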
  
  /*
 * Tries to drain stocked charges on other cpus. This function is asynchronous
 * and just queues a work item per cpu to drain locally on each cpu. The caller
 * can expect some charges to be returned to the res_counter later, but cannot
 * wait for that.
   */
  static void drain_all_stock_async(void)
  {
  	int cpu;
	/* This function schedules "drain" in an asynchronous way.
	 * The result of "drain" is not directly handled by callers, so
	 * if someone is already calling drain, we don't have to drain again.
	 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch
	 * any race. We just do a loose check here.
  	 */
  	if (atomic_read(&memcg_drain_count))
  		return;
  	/* Notify other cpus that system-wide "drain" is running */
  	atomic_inc(&memcg_drain_count);
  	get_online_cpus();
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
  		schedule_work_on(cpu, &stock->work);
  	}
   	put_online_cpus();
  	atomic_dec(&memcg_drain_count);
  	/* We don't wait for flush_work */
  }
  
  /* This is a synchronous drain interface. */
  static void drain_all_stock_sync(void)
  {
  	/* called when force_empty is called */
  	atomic_inc(&memcg_drain_count);
  	schedule_on_each_cpu(drain_local_stock);
  	atomic_dec(&memcg_drain_count);
  }
  
  static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
  
  	if (action != CPU_DEAD)
  		return NOTIFY_OK;
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
  
  /*
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1526
1527
 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
 * the OOM killer can be invoked.
8a9f3ccd2   Balbir Singh   Memory controller...
1528
   */
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1529
  static int __mem_cgroup_try_charge(struct mm_struct *mm,
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
1530
  			gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
8a9f3ccd2   Balbir Singh   Memory controller...
1531
  {
4e649152c   KAMEZAWA Hiroyuki   memcg: some modif...
1532
  	struct mem_cgroup *mem, *mem_over_limit;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1533
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4e649152c   KAMEZAWA Hiroyuki   memcg: some modif...
1534
  	struct res_counter *fail_res;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1535
  	int csize = CHARGE_SIZE;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1536

867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1537
1538
1539
1540
1541
1542
1543
1544
  	/*
	 * Unlike the global VM's OOM kill, we're not under a system-level
	 * memory shortage. So, allow a dying process to go ahead, in addition
	 * to a MEMDIE process.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)
  		     || fatal_signal_pending(current)))
  		goto bypass;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1545

8a9f3ccd2   Balbir Singh   Memory controller...
1546
  	/*
3be91277e   Hugh Dickins   memcgroup: tidy u...
1547
1548
  	 * We always charge the cgroup the mm_struct belongs to.
  	 * The mm_struct's mem_cgroup changes on task migration if the
8a9f3ccd2   Balbir Singh   Memory controller...
1549
1550
1551
  	 * thread group leader migrates. It's possible that mm is not
  	 * set, if so charge the init_mm (happens for pagecache usage).
  	 */
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
1552
1553
1554
  	mem = *memcg;
  	if (likely(!mem)) {
  		mem = try_get_mem_cgroup_from_mm(mm);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1555
  		*memcg = mem;
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
1556
  	} else {
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1557
  		css_get(&mem->css);
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
1558
  	}
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
1559
1560
  	if (unlikely(!mem))
  		return 0;
46f7e602f   Nikanth Karthikesan   memcg: fix build ...
1561
  	VM_BUG_ON(css_is_removed(&mem->css));
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1562
1563
  	if (mem_cgroup_is_root(mem))
  		goto done;
8a9f3ccd2   Balbir Singh   Memory controller...
1564

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
1565
  	while (1) {
0c3e73e84   Balbir Singh   memcg: improve re...
1566
  		int ret = 0;
75822b449   Balbir Singh   memory controller...
1567
  		unsigned long flags = 0;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1568

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1569
  		if (consume_stock(mem))
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
1570
  			goto done;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1571
1572
  
  		ret = res_counter_charge(&mem->res, csize, &fail_res);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
1573
1574
1575
  		if (likely(!ret)) {
  			if (!do_swap_account)
  				break;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1576
  			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
1577
1578
1579
  			if (likely(!ret))
  				break;
  			/* mem+swap counter fails */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1580
  			res_counter_uncharge(&mem->res, csize);
75822b449   Balbir Singh   memory controller...
1581
  			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
6d61ef409   Balbir Singh   memcg: memory cgr...
1582
1583
1584
1585
1586
1587
  			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
  									memsw);
  		} else
  			/* mem counter fails */
  			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
  									res);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1588
1589
1590
1591
1592
  		/* reduce request size and retry */
  		if (csize > PAGE_SIZE) {
  			csize = PAGE_SIZE;
  			continue;
  		}
3be91277e   Hugh Dickins   memcgroup: tidy u...
1593
  		if (!(gfp_mask & __GFP_WAIT))
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1594
  			goto nomem;
e1a1cd590   Balbir Singh   Memory controller...
1595

4e4169535   Balbir Singh   memory controller...
1596
1597
  		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
  						gfp_mask, flags);
4d1c62738   Daisuke Nishimura   memcg: make oom l...
1598
1599
  		if (ret)
  			continue;
66e1707bc   Balbir Singh   Memory controller...
1600
1601
  
  		/*
8869b8f6e   Hugh Dickins   memcg: memcontrol...
1602
1603
1604
1605
1606
  		 * try_to_free_mem_cgroup_pages() might not give us a full
  		 * picture of reclaim. Some pages are reclaimed and might be
  		 * moved to swap cache or just unmapped from the cgroup.
  		 * Check the limit again to see if the reclaim reduced the
  		 * current usage of the cgroup before giving up
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
1607
  		 *
8869b8f6e   Hugh Dickins   memcg: memcontrol...
1608
  		 */
b85a96c0b   Daisuke Nishimura   memcg: memory swa...
1609
1610
  		if (mem_cgroup_check_under_limit(mem_over_limit))
  			continue;
3be91277e   Hugh Dickins   memcgroup: tidy u...
1611

8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
1612
1613
1614
1615
1616
1617
1618
1619
  		/* try to avoid oom while someone is moving charge */
  		if (mc.moving_task && current != mc.moving_task) {
  			struct mem_cgroup *from, *to;
  			bool do_continue = false;
  			/*
  			 * There is a small race that "from" or "to" can be
  			 * freed by rmdir, so we use css_tryget().
  			 */
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
  			from = mc.from;
  			to = mc.to;
  			if (from && css_tryget(&from->css)) {
  				if (mem_over_limit->use_hierarchy)
  					do_continue = css_is_ancestor(
  							&from->css,
  							&mem_over_limit->css);
  				else
  					do_continue = (from == mem_over_limit);
  				css_put(&from->css);
  			}
  			if (!do_continue && to && css_tryget(&to->css)) {
  				if (mem_over_limit->use_hierarchy)
  					do_continue = css_is_ancestor(
  							&to->css,
  							&mem_over_limit->css);
  				else
  					do_continue = (to == mem_over_limit);
  				css_put(&to->css);
  			}
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
  			if (do_continue) {
  				DEFINE_WAIT(wait);
  				prepare_to_wait(&mc.waitq, &wait,
  							TASK_INTERRUPTIBLE);
  				/* moving charge context might have finished. */
  				if (mc.moving_task)
  					schedule();
  				finish_wait(&mc.waitq, &wait);
  				continue;
  			}
  		}
3be91277e   Hugh Dickins   memcgroup: tidy u...
1651
  		if (!nr_retries--) {
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1652
1653
1654
1655
1656
  			if (!oom)
  				goto nomem;
  			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
  				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
  				continue;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1657
  			}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1658
1659
1660
			/* When we reach here, the current task is dying. */
  			css_put(&mem->css);
  			goto bypass;
66e1707bc   Balbir Singh   Memory controller...
1661
  		}
8a9f3ccd2   Balbir Singh   Memory controller...
1662
  	}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1663
1664
  	if (csize > PAGE_SIZE)
  		refill_stock(mem, csize - PAGE_SIZE);
0c3e73e84   Balbir Singh   memcg: improve re...
1665
  done:
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1666
1667
1668
1669
  	return 0;
  nomem:
  	css_put(&mem->css);
  	return -ENOMEM;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1670
1671
1672
  bypass:
  	*memcg = NULL;
  	return 0;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1673
  }
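
/*
 * The charge loop above has a characteristic shape: try a whole batch
 * first, fall back to a single page when the counter refuses, then reclaim
 * and retry a bounded number of times before considering OOM.  A standalone
 * user-space sketch of just that control flow (illustration only, not
 * kernel code; the limit, the fake reclaim step and the sizes are made up):
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SZ		4096
#define BATCH		(32 * PAGE_SZ)
#define RETRIES		5	/* like MEM_CGROUP_RECLAIM_RETRIES */

static long usage, limit;

static bool counter_charge(long bytes)
{
	if (usage + bytes > limit)
		return false;
	usage += bytes;
	return true;
}

static void reclaim(void)
{
	if (usage >= PAGE_SZ)
		usage -= PAGE_SZ;	/* pretend one page was freed */
}

static int try_charge(void)
{
	long csize = BATCH;
	int retries = RETRIES;

	while (1) {
		if (counter_charge(csize))
			return 0;
		if (csize > PAGE_SZ) {	/* reduce request size and retry */
			csize = PAGE_SZ;
			continue;
		}
		reclaim();
		if (!retries--)
			return -1;	/* would move on to OOM handling */
	}
}

int main(void)
{
	int ret;

	limit = 10 * PAGE_SZ;
	usage = limit;			/* start exactly at the limit */
	ret = try_charge();
	printf("try_charge() = %d, usage = %ld pages\n", ret, usage / PAGE_SZ);
	return 0;
}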
8a9f3ccd2   Balbir Singh   Memory controller...
1674

a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1675
  /*
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1676
1677
1678
1679
 * Sometimes we have to undo a charge we got from try_charge().
 * This function does that: it uncharges and puts the css refcnt
 * taken by try_charge().
   */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1680
1681
  static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
  							unsigned long count)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1682
1683
  {
  	if (!mem_cgroup_is_root(mem)) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1684
  		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1685
  		if (do_swap_account)
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1686
1687
1688
1689
  			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
  		WARN_ON_ONCE(count > INT_MAX);
  		__css_put(&mem->css, (int)count);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1690
  	}
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1691
1692
1693
1694
1695
1696
  	/* we don't need css_put for root */
  }
  
  static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
  {
  	__mem_cgroup_cancel_charge(mem, 1);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1697
1698
1699
  }
  
  /*
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
 * A helper function to get a mem_cgroup from an ID. Must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or similar if
 * that is a concern. (Dropping a refcnt from swap can be called against a
 * removed memcg.)
   */
  static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
  {
  	struct cgroup_subsys_state *css;
  
  	/* ID 0 is unused ID */
  	if (!id)
  		return NULL;
  	css = css_lookup(&mem_cgroup_subsys, id);
  	if (!css)
  		return NULL;
  	return container_of(css, struct mem_cgroup, css);
  }
e42d9d5d4   Wu Fengguang   memcg: rename and...
1717
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1718
  {
e42d9d5d4   Wu Fengguang   memcg: rename and...
1719
  	struct mem_cgroup *mem = NULL;
3c776e646   Daisuke Nishimura   memcg: charge swa...
1720
  	struct page_cgroup *pc;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1721
  	unsigned short id;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1722
  	swp_entry_t ent;
3c776e646   Daisuke Nishimura   memcg: charge swa...
1723
  	VM_BUG_ON(!PageLocked(page));
3c776e646   Daisuke Nishimura   memcg: charge swa...
1724
  	pc = lookup_page_cgroup(page);
c0bd3f63c   Daisuke Nishimura   memcg: fix try_ge...
1725
  	lock_page_cgroup(pc);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1726
  	if (PageCgroupUsed(pc)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
1727
  		mem = pc->mem_cgroup;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1728
1729
  		if (mem && !css_tryget(&mem->css))
  			mem = NULL;
e42d9d5d4   Wu Fengguang   memcg: rename and...
1730
  	} else if (PageSwapCache(page)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
1731
  		ent.val = page_private(page);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
1732
1733
1734
1735
1736
1737
  		id = lookup_swap_cgroup(ent);
  		rcu_read_lock();
  		mem = mem_cgroup_lookup(id);
  		if (mem && !css_tryget(&mem->css))
  			mem = NULL;
  		rcu_read_unlock();
3c776e646   Daisuke Nishimura   memcg: charge swa...
1738
  	}
c0bd3f63c   Daisuke Nishimura   memcg: fix try_ge...
1739
  	unlock_page_cgroup(pc);
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1740
1741
  	return mem;
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1742
  /*
a5e924f5f   Daisuke Nishimura   memcg: remove mem...
1743
 * commit a charge got by __mem_cgroup_try_charge() and switch the page_cgroup to the
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1744
1745
1746
1747
1748
1749
1750
 * USED state. If it is already USED, uncharge and return.
   */
  
  static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  				     struct page_cgroup *pc,
  				     enum charge_type ctype)
  {
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1751
1752
1753
  	/* try_charge() can return NULL to *memcg, taking care of it. */
  	if (!mem)
  		return;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1754
1755
1756
1757
  
  	lock_page_cgroup(pc);
  	if (unlikely(PageCgroupUsed(pc))) {
  		unlock_page_cgroup(pc);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1758
  		mem_cgroup_cancel_charge(mem);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1759
  		return;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1760
  	}
4b3bde4c9   Balbir Singh   memcg: remove the...
1761

8a9f3ccd2   Balbir Singh   Memory controller...
1762
  	pc->mem_cgroup = mem;
261fb61a8   KAMEZAWA Hiroyuki   memcg: add commen...
1763
1764
1765
1766
1767
1768
1769
  	/*
  	 * We access a page_cgroup asynchronously without lock_page_cgroup().
  	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
  	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
  	 * before USED bit, we need memory barrier here.
  	 * See mem_cgroup_add_lru_list(), etc.
   	 */
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1770
  	smp_wmb();
4b3bde4c9   Balbir Singh   memcg: remove the...
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
  	switch (ctype) {
  	case MEM_CGROUP_CHARGE_TYPE_CACHE:
  	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
  		SetPageCgroupCache(pc);
  		SetPageCgroupUsed(pc);
  		break;
  	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
  		ClearPageCgroupCache(pc);
  		SetPageCgroupUsed(pc);
  		break;
  	default:
  		break;
  	}
3be91277e   Hugh Dickins   memcgroup: tidy u...
1784

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1785
  	mem_cgroup_charge_statistics(mem, pc, true);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1786

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1787
  	unlock_page_cgroup(pc);
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
1788
1789
1790
1791
1792
  	/*
  	 * "charge_statistics" updated event counter. Then, check it.
  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
  	 * if they exceeds softlimit.
  	 */
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
1793
  	memcg_check_events(mem, pc->page);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1794
  }
66e1707bc   Balbir Singh   Memory controller...
1795

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1796
  /**
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1797
   * __mem_cgroup_move_account - move account of the page
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1798
1799
1800
   * @pc:	page_cgroup of the page.
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1801
   * @uncharge: whether we should call uncharge and css_put against @from.
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1802
1803
   *
   * The caller must confirm following.
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1804
   * - page is not on LRU (isolate_page() is useful.)
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1805
   * - the pc is locked, used, and ->mem_cgroup points to @from.
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1806
   *
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1807
1808
1809
1810
 * This function doesn't do "charge" nor css_get to the new cgroup. That should
 * be done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge
 * is true, this function does "uncharge" from the old cgroup, but it doesn't if
 * @uncharge is false, so the caller should do the "uncharge".
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1811
   */
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1812
  static void __mem_cgroup_move_account(struct page_cgroup *pc,
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1813
  	struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1814
  {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1815
  	VM_BUG_ON(from == to);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1816
  	VM_BUG_ON(PageLRU(pc->page));
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1817
1818
1819
  	VM_BUG_ON(!PageCgroupLocked(pc));
  	VM_BUG_ON(!PageCgroupUsed(pc));
  	VM_BUG_ON(pc->mem_cgroup != from);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1820

8725d5416   KAMEZAWA Hiroyuki   memcg: fix race i...
1821
  	if (PageCgroupFileMapped(pc)) {
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
1822
1823
1824
1825
1826
  		/* Update mapped_file data for mem_cgroup */
  		preempt_disable();
  		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		preempt_enable();
d69b042f3   Balbir Singh   memcg: add file-b...
1827
  	}
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1828
1829
1830
1831
  	mem_cgroup_charge_statistics(from, pc, false);
  	if (uncharge)
  		/* This is not "cancel", but cancel_charge does all we need. */
  		mem_cgroup_cancel_charge(from);
d69b042f3   Balbir Singh   memcg: add file-b...
1832

854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1833
  	/* caller should have done css_get */
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1834
1835
  	pc->mem_cgroup = to;
  	mem_cgroup_charge_statistics(to, pc, true);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
1836
1837
1838
  	/*
  	 * We charges against "to" which may not have any tasks. Then, "to"
  	 * can be under rmdir(). But in current implementation, caller of
4ffef5fef   Daisuke Nishimura   memcg: move charg...
1839
1840
1841
	 * this function are just force_empty() and move charge, so it is
	 * guaranteed that "to" is never removed. So, we don't check the rmdir
	 * status here.
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
1842
  	 */
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1843
1844
1845
1846
1847
1848
1849
  }
  
  /*
   * check whether the @pc is valid for moving account and call
   * __mem_cgroup_move_account()
   */
  static int mem_cgroup_move_account(struct page_cgroup *pc,
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1850
  		struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1851
1852
1853
1854
  {
  	int ret = -EINVAL;
  	lock_page_cgroup(pc);
  	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1855
  		__mem_cgroup_move_account(pc, from, to, uncharge);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1856
1857
1858
  		ret = 0;
  	}
  	unlock_page_cgroup(pc);
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
1859
1860
1861
1862
1863
  	/*
  	 * check events
  	 */
  	memcg_check_events(to, pc->page);
  	memcg_check_events(from, pc->page);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
  	return ret;
  }
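
/*
 * A standalone user-space sketch of the move pattern above (illustration
 * only, not kernel code; the struct names and fields are made up): the
 * per-object flag maintained by the FILE_MAPPED update tells the mover
 * whether the statistic has to follow the object from one group to the
 * other, while the plain page count always moves.
 */
#include <stdio.h>
#include <stdbool.h>

struct group {
	long pages;		/* charged pages */
	long file_mapped;	/* like MEM_CGROUP_STAT_FILE_MAPPED */
};

struct obj {
	struct group *owner;	/* like pc->mem_cgroup */
	bool file_mapped;	/* like PageCgroupFileMapped */
};

static void move_account(struct obj *o, struct group *from, struct group *to)
{
	if (o->owner != from)
		return;			/* not ours to move */

	if (o->file_mapped) {
		from->file_mapped--;	/* stat follows the object */
		to->file_mapped++;
	}
	from->pages--;
	o->owner = to;
	to->pages++;
}

int main(void)
{
	struct group a = { 1, 1 }, b = { 0, 0 };
	struct obj page = { &a, true };

	move_account(&page, &a, &b);
	printf("a: pages=%ld mapped=%ld   b: pages=%ld mapped=%ld\n",
	       a.pages, a.file_mapped, b.pages, b.file_mapped);
	return 0;
}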
  
  /*
   * move charges to its parent.
   */
  
  static int mem_cgroup_move_parent(struct page_cgroup *pc,
  				  struct mem_cgroup *child,
  				  gfp_t gfp_mask)
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1875
  	struct page *page = pc->page;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1876
1877
1878
  	struct cgroup *cg = child->css.cgroup;
  	struct cgroup *pcg = cg->parent;
  	struct mem_cgroup *parent;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1879
1880
1881
1882
1883
  	int ret;
  
  	/* Is ROOT ? */
  	if (!pcg)
  		return -EINVAL;
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1884
1885
1886
1887
1888
  	ret = -EBUSY;
  	if (!get_page_unless_zero(page))
  		goto out;
  	if (isolate_lru_page(page))
  		goto put;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1889

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1890
  	parent = mem_cgroup_from_cont(pcg);
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
1891
  	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1892
  	if (ret || !parent)
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1893
  		goto put_back;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1894

854ffa8d1   Daisuke Nishimura   memcg: improve pe...
1895
1896
1897
  	ret = mem_cgroup_move_account(pc, child, parent, true);
  	if (ret)
  		mem_cgroup_cancel_charge(parent);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1898
  put_back:
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1899
  	putback_lru_page(page);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1900
  put:
40d58138f   Daisuke Nishimura   memcg: fix error ...
1901
  	put_page(page);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
1902
  out:
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1903
1904
  	return ret;
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
  /*
   * Charge the memory controller for page usage.
   * Return
   * 0 if the charge was successful
   * < 0 if the cgroup is over its limit
   */
  static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
  				gfp_t gfp_mask, enum charge_type ctype,
  				struct mem_cgroup *memcg)
  {
  	struct mem_cgroup *mem;
  	struct page_cgroup *pc;
  	int ret;
  
  	pc = lookup_page_cgroup(page);
  	/* can happen at boot */
  	if (unlikely(!pc))
  		return 0;
  	prefetchw(pc);
  
  	mem = memcg;
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
1926
  	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1927
  	if (ret || !mem)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1928
1929
1930
  		return ret;
  
  	__mem_cgroup_commit_charge(mem, pc, ctype);
8a9f3ccd2   Balbir Singh   Memory controller...
1931
  	return 0;
8a9f3ccd2   Balbir Singh   Memory controller...
1932
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1933
1934
  int mem_cgroup_newpage_charge(struct page *page,
  			      struct mm_struct *mm, gfp_t gfp_mask)
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
1935
  {
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
1936
  	if (mem_cgroup_disabled())
cede86acd   Li Zefan   memcg: clean up c...
1937
  		return 0;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1938
1939
  	if (PageCompound(page))
  		return 0;
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
	/*
	 * If already mapped, we don't have to account.
	 * If page cache, page->mapping has an address_space.
	 * But page->mapping may hold a stale anon_vma pointer; detect that
	 * with the PageAnon() check. A newly-mapped anon page's page->mapping
	 * is NULL.
	 */
  	if (page_mapped(page) || (page->mapping && !PageAnon(page)))
  		return 0;
  	if (unlikely(!mm))
  		mm = &init_mm;
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
1951
  	return mem_cgroup_charge_common(page, mm, gfp_mask,
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
1952
  				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
1953
  }
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
1954
1955
1956
  static void
  __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
  					enum charge_type ctype);
e1a1cd590   Balbir Singh   Memory controller...
1957
1958
  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
  				gfp_t gfp_mask)
8697d3319   Balbir Singh   Memory controller...
1959
  {
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1960
1961
  	struct mem_cgroup *mem = NULL;
  	int ret;
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
1962
  	if (mem_cgroup_disabled())
cede86acd   Li Zefan   memcg: clean up c...
1963
  		return 0;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1964
1965
  	if (PageCompound(page))
  		return 0;
accf163e6   KAMEZAWA Hiroyuki   memcg: remove a r...
1966
1967
1968
1969
1970
1971
1972
1973
	/*
	 * Corner case handling. This is usually called from
	 * add_to_page_cache(). But some filesystems (shmem) precharge the page
	 * before calling it and then call add_to_page_cache() with GFP_NOWAIT.
	 *
	 * In the GFP_NOWAIT case, the page may be pre-charged before
	 * add_to_page_cache() is called. (See shmem.c.) Check it here and avoid
	 * charging twice. (It works but has to pay a slightly larger cost.)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1974
1975
  	 * And when the page is SwapCache, it should take swap information
  	 * into account. This is under lock_page() now.
accf163e6   KAMEZAWA Hiroyuki   memcg: remove a r...
1976
1977
1978
  	 */
  	if (!(gfp_mask & __GFP_WAIT)) {
  		struct page_cgroup *pc;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1979
1980
1981
1982
1983
1984
1985
  
  		pc = lookup_page_cgroup(page);
  		if (!pc)
  			return 0;
  		lock_page_cgroup(pc);
  		if (PageCgroupUsed(pc)) {
  			unlock_page_cgroup(pc);
accf163e6   KAMEZAWA Hiroyuki   memcg: remove a r...
1986
1987
  			return 0;
  		}
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
1988
  		unlock_page_cgroup(pc);
accf163e6   KAMEZAWA Hiroyuki   memcg: remove a r...
1989
  	}
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1990
  	if (unlikely(!mm && !mem))
8697d3319   Balbir Singh   Memory controller...
1991
  		mm = &init_mm;
accf163e6   KAMEZAWA Hiroyuki   memcg: remove a r...
1992

c05555b57   KAMEZAWA Hiroyuki   memcg: atomic ops...
1993
1994
  	if (page_is_file_cache(page))
  		return mem_cgroup_charge_common(page, mm, gfp_mask,
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
1995
  				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
1996

83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
1997
1998
1999
2000
2001
2002
2003
2004
2005
  	/* shmem */
  	if (PageSwapCache(page)) {
  		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
  		if (!ret)
  			__mem_cgroup_commit_charge_swapin(page, mem,
  					MEM_CGROUP_CHARGE_TYPE_SHMEM);
  	} else
  		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
  					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2006

b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2007
  	return ret;
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
2008
  }
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2009
2010
2011
  /*
   * While swap-in, try_charge -> commit or cancel, the page is locked.
   * And when try_charge() successfully returns, one refcnt to memcg without
21ae2956c   Uwe Kleine-König   tree-wide: fix ty...
2012
   * struct page_cgroup is acquired. This refcnt will be consumed by
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2013
2014
   * "commit()" or removed by "cancel()"
   */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2015
2016
2017
2018
2019
  int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
  				 struct page *page,
  				 gfp_t mask, struct mem_cgroup **ptr)
  {
  	struct mem_cgroup *mem;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2020
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2021

f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2022
  	if (mem_cgroup_disabled())
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2023
2024
2025
2026
  		return 0;
  
  	if (!do_swap_account)
  		goto charge_cur_mm;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2027
2028
  	/*
  	 * A racing thread's fault, or swapoff, may have already updated
407f9c8b0   Hugh Dickins   ksm: mem cgroup c...
2029
2030
2031
  	 * the pte, and even removed page from swap cache: in those cases
  	 * do_swap_page()'s pte_same() test will fail; but there's also a
  	 * KSM case which does need to charge the page.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2032
2033
  	 */
  	if (!PageSwapCache(page))
407f9c8b0   Hugh Dickins   ksm: mem cgroup c...
2034
  		goto charge_cur_mm;
e42d9d5d4   Wu Fengguang   memcg: rename and...
2035
  	mem = try_get_mem_cgroup_from_page(page);
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2036
2037
  	if (!mem)
  		goto charge_cur_mm;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2038
  	*ptr = mem;
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
2039
  	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2040
2041
2042
  	/* drop extra refcnt from tryget */
  	css_put(&mem->css);
  	return ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2043
2044
2045
  charge_cur_mm:
  	if (unlikely(!mm))
  		mm = &init_mm;
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
2046
  	return __mem_cgroup_try_charge(mm, mask, ptr, true);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2047
  }
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2048
2049
2050
  static void
  __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
  					enum charge_type ctype)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2051
2052
  {
  	struct page_cgroup *pc;
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2053
  	if (mem_cgroup_disabled())
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2054
2055
2056
  		return;
  	if (!ptr)
  		return;
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
2057
  	cgroup_exclude_rmdir(&ptr->css);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2058
  	pc = lookup_page_cgroup(page);
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
2059
  	mem_cgroup_lru_del_before_commit_swapcache(page);
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2060
  	__mem_cgroup_commit_charge(ptr, pc, ctype);
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
2061
  	mem_cgroup_lru_add_after_commit_swapcache(page);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2062
2063
2064
  	/*
  	 * Now swap is on-memory. This means this page may be
  	 * counted both as mem and swap....double count.
03f3c4336   KAMEZAWA Hiroyuki   memcg: fix swap a...
2065
2066
2067
  	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
  	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
  	 * may call delete_from_swap_cache() before reach here.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2068
  	 */
03f3c4336   KAMEZAWA Hiroyuki   memcg: fix swap a...
2069
  	if (do_swap_account && PageSwapCache(page)) {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2070
  		swp_entry_t ent = {.val = page_private(page)};
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2071
  		unsigned short id;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2072
  		struct mem_cgroup *memcg;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2073
2074
2075
2076
  
  		id = swap_cgroup_record(ent, 0);
  		rcu_read_lock();
  		memcg = mem_cgroup_lookup(id);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2077
  		if (memcg) {
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2078
2079
2080
2081
			/*
			 * This recorded memcg can be an obsolete one. So,
			 * avoid calling css_tryget().
			 */
0c3e73e84   Balbir Singh   memcg: improve re...
2082
  			if (!mem_cgroup_is_root(memcg))
4e649152c   KAMEZAWA Hiroyuki   memcg: some modif...
2083
  				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
0c3e73e84   Balbir Singh   memcg: improve re...
2084
  			mem_cgroup_swap_statistics(memcg, false);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2085
2086
  			mem_cgroup_put(memcg);
  		}
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2087
  		rcu_read_unlock();
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2088
  	}
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
2089
2090
2091
2092
2093
2094
	/*
	 * At swapin, we may charge against a cgroup which has no tasks, so
	 * rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again; check it here.
	 */
  	cgroup_release_and_wakeup_rmdir(&ptr->css);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2095
  }
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2096
2097
2098
2099
2100
  void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
  {
  	__mem_cgroup_commit_charge_swapin(page, ptr,
  					MEM_CGROUP_CHARGE_TYPE_MAPPED);
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2101
2102
  void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
  {
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2103
  	if (mem_cgroup_disabled())
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2104
2105
2106
  		return;
  	if (!mem)
  		return;
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2107
  	mem_cgroup_cancel_charge(mem);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2108
  }
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2109
2110
2111
2112
2113
2114
2115
2116
  static void
  __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
  {
  	struct memcg_batch_info *batch = NULL;
  	bool uncharge_memsw = true;
  	/* If swapout, usage of swap doesn't decrease */
  	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
  		uncharge_memsw = false;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
  
  	batch = &current->memcg_batch;
  	/*
	 * Usually, we do css_get() when we remember a memcg pointer.
	 * But in this case, we keep res->usage until the end of a series of
	 * uncharges, so it's ok to ignore the memcg's refcnt.
  	 */
  	if (!batch->memcg)
  		batch->memcg = mem;
  	/*
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
  	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
	 * In those cases, all pages freed continuously can be expected to be in
	 * the same cgroup, and we have a chance to coalesce uncharges.
	 * But we uncharge one by one if this task is being killed by OOM
	 * (TIF_MEMDIE), because we want to do the uncharge as soon as possible.
  	 */
  
  	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
  		goto direct_uncharge;
  
  	/*
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
	 * In the typical case, batch->memcg == mem. This means we can
	 * merge a series of uncharges into one uncharge of the res_counter.
	 * If not, we uncharge the res_counter one by one.
  	 */
  	if (batch->memcg != mem)
  		goto direct_uncharge;
  	/* remember freed charge and uncharge it later */
  	batch->bytes += PAGE_SIZE;
  	if (uncharge_memsw)
  		batch->memsw_bytes += PAGE_SIZE;
  	return;
  direct_uncharge:
  	res_counter_uncharge(&mem->res, PAGE_SIZE);
  	if (uncharge_memsw)
  		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2153
2154
  	if (unlikely(batch->memcg != mem))
  		memcg_oom_recover(mem);
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2155
2156
  	return;
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2157

8697d3319   Balbir Singh   Memory controller...
2158
  /*
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2159
   * uncharge if !page_mapped(page)
8a9f3ccd2   Balbir Singh   Memory controller...
2160
   */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2161
  static struct mem_cgroup *
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2162
  __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
8a9f3ccd2   Balbir Singh   Memory controller...
2163
  {
8289546e5   Hugh Dickins   memcg: remove mem...
2164
  	struct page_cgroup *pc;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2165
  	struct mem_cgroup *mem = NULL;
072c56c13   KAMEZAWA Hiroyuki   per-zone and recl...
2166
  	struct mem_cgroup_per_zone *mz;
8a9f3ccd2   Balbir Singh   Memory controller...
2167

f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2168
  	if (mem_cgroup_disabled())
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2169
  		return NULL;
4077960e2   Balbir Singh   memory controller...
2170

d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2171
  	if (PageSwapCache(page))
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2172
  		return NULL;
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2173

8697d3319   Balbir Singh   Memory controller...
2174
  	/*
3c541e14b   Balbir Singh   Memory controller...
2175
  	 * Check if our page_cgroup is valid
8697d3319   Balbir Singh   Memory controller...
2176
  	 */
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2177
2178
  	pc = lookup_page_cgroup(page);
  	if (unlikely(!pc || !PageCgroupUsed(pc)))
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2179
  		return NULL;
b9c565d5a   Hugh Dickins   memcg: remove cle...
2180

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2181
  	lock_page_cgroup(pc);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2182

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2183
  	mem = pc->mem_cgroup;
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2184
2185
2186
2187
2188
  	if (!PageCgroupUsed(pc))
  		goto unlock_out;
  
  	switch (ctype) {
  	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
2189
  	case MEM_CGROUP_CHARGE_TYPE_DROP:
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
2190
2191
  		/* See mem_cgroup_prepare_migration() */
  		if (page_mapped(page) || PageCgroupMigration(pc))
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
  			goto unlock_out;
  		break;
  	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
  		if (!PageAnon(page)) {	/* Shared memory */
  			if (page->mapping && !page_is_file_cache(page))
  				goto unlock_out;
  		} else if (page_mapped(page)) /* Anon */
  				goto unlock_out;
  		break;
  	default:
  		break;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2203
  	}
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2204

569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2205
2206
  	if (!mem_cgroup_is_root(mem))
  		__do_uncharge(mem, ctype);
0c3e73e84   Balbir Singh   memcg: improve re...
2207
2208
  	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
  		mem_cgroup_swap_statistics(mem, true);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2209
  	mem_cgroup_charge_statistics(mem, pc, false);
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
2210

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2211
  	ClearPageCgroupUsed(pc);
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
2212
2213
2214
2215
2216
2217
  	/*
  	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
	 * freed from the LRU. This is safe because an uncharged page is expected
	 * not to be reused (it is freed soon). The exception is SwapCache, which
	 * is handled by special functions.
  	 */
b9c565d5a   Hugh Dickins   memcg: remove cle...
2218

69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2219
  	mz = page_cgroup_zoneinfo(pc);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2220
  	unlock_page_cgroup(pc);
fb59e9f1e   Hugh Dickins   memcg: fix oops o...
2221

d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
2222
  	memcg_check_events(mem, page);
a7fe942e9   KAMEZAWA Hiroyuki   memcg: swapout re...
2223
2224
2225
  	/* at swapout, this memcg will be accessed to record to swap */
  	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
  		css_put(&mem->css);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
2226

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2227
  	return mem;
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2228
2229
2230
  
  unlock_out:
  	unlock_page_cgroup(pc);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2231
  	return NULL;
3c541e14b   Balbir Singh   Memory controller...
2232
  }
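/*
 * Uncharge a page that is being removed from an address space. The early
 * checks skip pages that are still mapped and non-anonymous pages that
 * still have a mapping; those are uncharged via their own paths
 * (rmap / page cache removal).
 */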
void mem_cgroup_uncharge_page(struct page *page)
{
	/* early check. */
	if (page_mapped(page))
		return;
	if (page->mapping && !PageAnon(page))
		return;
	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
}

void mem_cgroup_uncharge_cache_page(struct page *page)
{
	VM_BUG_ON(page_mapped(page));
	VM_BUG_ON(page->mapping);
  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
  }
/*
 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
 * In those cases, pages are freed continuously and we can expect they are
 * in the same memcg. Each of these callers itself limits the number of
 * pages freed at once, so uncharge_start/end() brackets them properly.
 * The pair may be called a plural (nested) number of times in a context.
 */
  
  void mem_cgroup_uncharge_start(void)
  {
  	current->memcg_batch.do_batch++;
  	/* We can do nest. */
  	if (current->memcg_batch.do_batch == 1) {
  		current->memcg_batch.memcg = NULL;
  		current->memcg_batch.bytes = 0;
  		current->memcg_batch.memsw_bytes = 0;
  	}
  }
  
  void mem_cgroup_uncharge_end(void)
  {
  	struct memcg_batch_info *batch = &current->memcg_batch;
  
  	if (!batch->do_batch)
  		return;
  
  	batch->do_batch--;
  	if (batch->do_batch) /* If stacked, do nothing. */
  		return;
  
  	if (!batch->memcg)
  		return;
  	/*
  	 * This "batch->memcg" is valid without any css_get/put etc...
	 * because we hide charges behind us.
  	 */
  	if (batch->bytes)
  		res_counter_uncharge(&batch->memcg->res, batch->bytes);
  	if (batch->memsw_bytes)
  		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
	memcg_oom_recover(batch->memcg);
  	/* forget this pointer (for sanity check) */
  	batch->memcg = NULL;
  }
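/*
 * Typical usage (illustrative sketch, not a verbatim call site): a caller
 * that frees many pages in a row brackets its loop so the uncharges are
 * coalesced into a single res_counter update per memcg:
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being truncated/unmapped:
 *		mem_cgroup_uncharge_page(page);    (or the cache variant)
 *	mem_cgroup_uncharge_end();
 */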
#ifdef CONFIG_SWAP
/*
 * Called after __delete_from_swap_cache(); drops the "page" account.
 * The memcg information is recorded to the swap_cgroup of "ent".
 */
void
mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
{
	struct mem_cgroup *memcg;
	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;

	if (!swapout) /* this was a swap cache but the swap is unused ! */
		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;

	memcg = __mem_cgroup_uncharge_common(page, ctype);

	/* record memcg information */
	if (do_swap_account && swapout && memcg) {
		swap_cgroup_record(ent, css_id(&memcg->css));
		mem_cgroup_get(memcg);
	}
	if (swapout && memcg)
		css_put(&memcg->css);
}
  #endif
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  /*
   * called from swap_entry_free(). remove record in swap_cgroup and
   * uncharge "memsw" account.
   */
  void mem_cgroup_uncharge_swap(swp_entry_t ent)
{
	struct mem_cgroup *memcg;
	unsigned short id;

	if (!do_swap_account)
		return;
	id = swap_cgroup_record(ent, 0);
	rcu_read_lock();
	memcg = mem_cgroup_lookup(id);
	if (memcg) {
		/*
		 * We uncharge this because the swap entry is freed.
		 * This memcg can be an obsolete one. We avoid calling
		 * css_tryget().
		 */
		if (!mem_cgroup_is_root(memcg))
			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
		mem_cgroup_swap_statistics(memcg, false);
		mem_cgroup_put(memcg);
	}
	rcu_read_unlock();
  }
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
 * @need_fixup: whether we should fixup res_counters and refcounts.
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
   * The caller must have charged to @to, IOW, called res_counter_charge() about
   * both res and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	unsigned short old_id, new_id;

	old_id = css_id(&from->css);
	new_id = css_id(&to->css);

	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mem_cgroup_swap_statistics(from, false);
		mem_cgroup_swap_statistics(to, true);
		/*
		 * This function is only called from task migration context now.
		 * It postpones res_counter and refcount handling till the end
		 * of task migration(mem_cgroup_clear_mc()) for performance
		 * improvement. But we cannot postpone mem_cgroup_get(to)
		 * because if the process that has been moved to @to does
		 * swap-in, the refcount of @to might be decreased to 0.
		 */
		mem_cgroup_get(to);
		if (need_fixup) {
			if (!mem_cgroup_is_root(from))
				res_counter_uncharge(&from->memsw, PAGE_SIZE);
			mem_cgroup_put(from);
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			if (!mem_cgroup_is_root(to))
				res_counter_uncharge(&to->res, PAGE_SIZE);
			css_put(&to->css);
		}
		return 0;
	}
	return -EINVAL;
}
#else
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
		struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
{
	return -EINVAL;
}
  #endif
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2405

/*
 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
 * page belongs to.
 */
int mem_cgroup_prepare_migration(struct page *page,
	struct page *newpage, struct mem_cgroup **ptr)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem = NULL;
	enum charge_type ctype;
	int ret = 0;

	if (mem_cgroup_disabled())
		return 0;
	pc = lookup_page_cgroup(page);
	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		mem = pc->mem_cgroup;
		css_get(&mem->css);
		/*
		 * At migrating an anonymous page, its mapcount goes down
		 * to 0 and uncharge() will be called. But, even if it's fully
		 * unmapped, migration may fail and this page has to be
		 * charged again. We set the MIGRATION flag here and delay
		 * uncharge until end_migration() is called.
		 *
		 * Corner Case Thinking
		 * A)
		 * When the old page was mapped as Anon and it's unmap-and-freed
		 * while migration was ongoing.
		 * If unmap finds the old page, uncharge() of it will be delayed
		 * until end_migration(). If unmap finds a new page, it's
		 * uncharged when it makes mapcount go 1->0. If unmap code
		 * finds a swap_migration_entry, the new page will not be mapped
		 * and end_migration() will find it (mapcount == 0).
		 *
		 * B)
		 * When the old page was mapped but migration fails, the kernel
		 * remaps it. A charge for it is kept by the MIGRATION flag even
		 * if mapcount goes down to 0. We can do the remap successfully
		 * without charging it again.
		 *
		 * C)
		 * The "old" page is under lock_page() until the end of
		 * migration, so the old page itself will not be swapped out.
		 * If the new page is swapped out before end_migration, our
		 * hook to the usual swap-out path will catch the event.
		 */
  		if (PageAnon(page))
  			SetPageCgroupMigration(pc);
	}
	unlock_page_cgroup(pc);
	/*
	 * If the page is not charged at this point,
	 * we return here.
	 */
	if (!mem)
		return 0;

	*ptr = mem;
  	ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
  	css_put(&mem->css);/* drop extra refcnt */
  	if (ret || *ptr == NULL) {
  		if (PageAnon(page)) {
  			lock_page_cgroup(pc);
  			ClearPageCgroupMigration(pc);
  			unlock_page_cgroup(pc);
  			/*
  			 * The old page may be fully unmapped while we kept it.
  			 */
  			mem_cgroup_uncharge_page(page);
  		}
  		return -ENOMEM;
	}
  	/*
  	 * We charge new page before it's used/mapped. So, even if unlock_page()
  	 * is called before end_migration, we can catch all events on this new
  	 * page. In the case new page is migrated but not remapped, new page's
  	 * mapcount will be finally 0 and we call uncharge in end_migration().
  	 */
  	pc = lookup_page_cgroup(newpage);
  	if (PageAnon(page))
  		ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
  	else if (page_is_file_cache(page))
  		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
  	else
  		ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
  	__mem_cgroup_commit_charge(mem, pc, ctype);
	return ret;
  }

/* remove redundant charge if migration failed */
void mem_cgroup_end_migration(struct mem_cgroup *mem,
	struct page *oldpage, struct page *newpage)
{
	struct page *used, *unused;
	struct page_cgroup *pc;

	if (!mem)
		return;
	/* blocks rmdir() */
	cgroup_exclude_rmdir(&mem->css);
	/* at migration success, oldpage->mapping is NULL. */
	if (oldpage->mapping) {
		used = oldpage;
		unused = newpage;
	} else {
		used = newpage;
		unused = oldpage;
	}
	/*
	 * We disallowed uncharge of pages under migration because mapcount
	 * of the page goes down to zero, temporarily.
	 * Clear the flag and check whether the page should be charged.
	 */
	pc = lookup_page_cgroup(oldpage);
	lock_page_cgroup(pc);
	ClearPageCgroupMigration(pc);
	unlock_page_cgroup(pc);

	if (unused != oldpage)
		pc = lookup_page_cgroup(unused);
	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);

	pc = lookup_page_cgroup(used);
	/*
	 * If a page is a file cache, radix-tree replacement is atomic
	 * and we can skip this check. When it was an Anon page, its mapcount
	 * goes down to 0. But because we added the MIGRATION flag, it's not
	 * uncharged yet. There are several cases, but the page->mapcount check
	 * and the USED bit check in mem_cgroup_uncharge_page() will do enough
	 * checking. (see prepare_charge() also)
	 */
	if (PageAnon(used))
		mem_cgroup_uncharge_page(used);
	/*
	 * At migration, we may charge an account against a cgroup which has
	 * no tasks.
	 * So, rmdir()->pre_destroy() can be called while we do this charge.
	 * In that case, we need to call pre_destroy() again. Check it here.
	 */
	cgroup_release_and_wakeup_rmdir(&mem->css);
  }
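/*
 * Illustrative call sequence during page migration (a sketch based on the
 * hooks above, not a verbatim copy of the migration core):
 *
 *	mem_cgroup_prepare_migration(page, newpage, &mem);
 *		-> charges "mem" for the new page and, for Anon pages, sets
 *		   the page_cgroup MIGRATION flag on the old page
 *	... copy data, remap the new page or fall back to the old one ...
 *	mem_cgroup_end_migration(mem, oldpage, newpage);
 *		-> clears the MIGRATION flag and uncharges whichever page
 *		   ended up unused
 */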

/*
 * A call to try to shrink memory usage on charge failure at shmem's swapin.
 * Calling hierarchical_reclaim is not enough because we should update
 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking a
 * global OOM.
 * Moreover, considering hierarchy, we should reclaim from the mem_over_limit,
 * not from the memcg which this page would be charged to.
 * try_charge_swapin does all of this work properly.
 */
int mem_cgroup_shmem_charge_fallback(struct page *page,
			    struct mm_struct *mm,
			    gfp_t gfp_mask)
{
	struct mem_cgroup *mem = NULL;
	int ret;

	if (mem_cgroup_disabled())
		return 0;

	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
	if (!ret)
		mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */

	return ret;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2574
  static DEFINE_MUTEX(set_limit_mutex);
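/*
 * Set a new value for memory.limit_in_bytes. A limit below the current
 * usage cannot be applied directly, so the loop below alternates
 * res_counter_set_limit() attempts with hierarchical reclaim until it
 * succeeds, the retry budget runs out, or a signal arrives.
 */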
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
				unsigned long long val)
{
	int retry_count;
	u64 memswlimit, memlimit;
	int ret = 0;
	int children = mem_cgroup_count_children(memcg);
	u64 curusage, oldusage;
	int enlarge;
  
	/*
	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * depends on the caller. We set our retry-count to be a function
	 * of the number of children which we should visit in this loop.
	 */
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
  
  	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2593

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2594
  	enlarge = 0;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2595
  	while (retry_count) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
		/*
		 * Rather than hiding all of this in some function, do it in an
		 * open-coded manner so that what really happens is visible.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
  		mutex_lock(&set_limit_mutex);
  		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		if (memswlimit < val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2610
2611
  			break;
  		}
  
  		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		if (memlimit < val)
  			enlarge = 1;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2616
  		ret = res_counter_set_limit(&memcg->res, val);
  		if (!ret) {
  			if (memswlimit == val)
  				memcg->memsw_is_minimum = true;
  			else
  				memcg->memsw_is_minimum = false;
  		}
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
aa20d489c   Bob Liu   memcg: code clean...
2627
  		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
4e4169535   Balbir Singh   memory controller...
2628
  						MEM_CGROUP_RECLAIM_SHRINK);
  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
  		/* Usage is reduced ? */
    		if (curusage >= oldusage)
  			retry_count--;
  		else
  			oldusage = curusage;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2635
  	}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2636
2637
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
14797e236   KOSAKI Motohiro   memcg: add inacti...
2638

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2639
2640
  	return ret;
  }
338c84310   Li Zefan   memcg: remove som...
2641
2642
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
  					unsigned long long val)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2643
  {
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2644
  	int retry_count;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2645
  	u64 memlimit, memswlimit, oldusage, curusage;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2646
2647
  	int children = mem_cgroup_count_children(memcg);
  	int ret = -EBUSY;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2648
  	int enlarge = 0;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2649

81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2650
2651
2652
  	/* see mem_cgroup_resize_res_limit */
   	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
  	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
  	while (retry_count) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
		/*
		 * Rather than hiding all of this in some function, do it in an
		 * open-coded manner so that what really happens is visible.
		 * We have to guarantee mem->res.limit < mem->memsw.limit.
		 */
  		mutex_lock(&set_limit_mutex);
  		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		if (memlimit > val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
  			break;
  		}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2670
2671
2672
  		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		if (memswlimit < val)
  			enlarge = 1;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2673
  		ret = res_counter_set_limit(&memcg->memsw, val);
  		if (!ret) {
  			if (memlimit == val)
  				memcg->memsw_is_minimum = true;
  			else
  				memcg->memsw_is_minimum = false;
  		}
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
4e4169535   Balbir Singh   memory controller...
2684
  		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
75822b449   Balbir Singh   memory controller...
2685
2686
  						MEM_CGROUP_RECLAIM_NOSWAP |
  						MEM_CGROUP_RECLAIM_SHRINK);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2687
  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2688
  		/* Usage is reduced ? */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2689
  		if (curusage >= oldusage)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2690
  			retry_count--;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2691
2692
  		else
  			oldusage = curusage;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2693
  	}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2694
2695
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2696
2697
  	return ret;
  }
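/*
 * Reclaim from the memcg that currently exceeds its soft limit the most
 * (tracked in the per-node/zone RB tree), then reinsert it keyed by its
 * remaining excess. Returns the number of pages reclaimed for this zone.
 */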
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  						gfp_t gfp_mask, int nid,
  						int zid)
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
ef8745c1e   KAMEZAWA Hiroyuki   memcg: reduce che...
2707
  	unsigned long long excess;
  
  	if (order > 0)
  		return 0;
  
  	mctz = soft_limit_tree_node_zone(nid, zid);
	/*
	 * This loop can run for a while, especially if mem_cgroups
	 * continuously keep exceeding their soft limit and putting the
	 * system under pressure.
	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
  
  		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
  						gfp_mask,
  						MEM_CGROUP_RECLAIM_SOFT);
  		nr_reclaimed += reclaimed;
  		spin_lock(&mctz->lock);
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
  		if (!reclaimed) {
  			do {
  				/*
  				 * Loop until we find yet another one.
  				 *
  				 * By the time we get the soft_limit lock
				 * again, someone might have added the
  				 * group back on the RB tree. Iterate to
  				 * make sure we get a different mem.
  				 * mem_cgroup_largest_soft_limit_node returns
  				 * NULL if no other cgroup is present on
  				 * the tree
  				 */
  				next_mz =
  				__mem_cgroup_largest_soft_limit_node(mctz);
  				if (next_mz == mz) {
  					css_put(&next_mz->mem->css);
  					next_mz = NULL;
  				} else /* next_mz == NULL or other memcg */
  					break;
  			} while (1);
  		}
4e4169535   Balbir Singh   memory controller...
2759
  		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
ef8745c1e   KAMEZAWA Hiroyuki   memcg: reduce che...
2760
  		excess = res_counter_soft_limit_excess(&mz->mem->res);
  		/*
  		 * One school of thought says that we should not add
  		 * back the node to the tree if reclaim returns 0.
  		 * But our reclaim could return 0, simply because due
  		 * to priority we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this as a longer
  		 * term TODO.
  		 */
ef8745c1e   KAMEZAWA Hiroyuki   memcg: reduce che...
2769
2770
  		/* If excess == 0, no tree ops */
  		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
  		spin_unlock(&mctz->lock);
  		css_put(&mz->mem->css);
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
  		css_put(&next_mz->mem->css);
  	return nr_reclaimed;
  }
/*
 * This routine traverses page_cgroups in the given list and drops them all.
 * *And* this routine doesn't reclaim the pages themselves, it just removes
 * the page_cgroups.
 */
  static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2793
  				int node, int zid, enum lru_list lru)
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2794
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2795
2796
  	struct zone *zone;
  	struct mem_cgroup_per_zone *mz;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2797
  	struct page_cgroup *pc, *busy;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2798
  	unsigned long flags, loop;
072c56c13   KAMEZAWA Hiroyuki   per-zone and recl...
2799
  	struct list_head *list;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2800
  	int ret = 0;
072c56c13   KAMEZAWA Hiroyuki   per-zone and recl...
2801

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2802
2803
  	zone = &NODE_DATA(node)->node_zones[zid];
  	mz = mem_cgroup_zoneinfo(mem, node, zid);
b69408e88   Christoph Lameter   vmscan: Use an in...
2804
  	list = &mz->lists[lru];
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2805

  	loop = MEM_CGROUP_ZSTAT(mz, lru);
  	/* give some margin against EBUSY etc...*/
  	loop += 256;
  	busy = NULL;
  	while (loop--) {
  		ret = 0;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2812
  		spin_lock_irqsave(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2813
  		if (list_empty(list)) {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2814
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2815
  			break;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2816
2817
2818
2819
  		}
  		pc = list_entry(list->prev, struct page_cgroup, lru);
  		if (busy == pc) {
  			list_move(&pc->lru, list);
648bcc771   Thiago Farina   mm/memcontrol.c: ...
2820
  			busy = NULL;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2821
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2822
2823
  			continue;
  		}
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2824
  		spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2825

2c26fdd70   KAMEZAWA Hiroyuki   memcg: revert gfp...
2826
  		ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2827
  		if (ret == -ENOMEM)
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2828
  			break;
  
  		if (ret == -EBUSY || ret == -EINVAL) {
  			/* found lock contention or "pc" is obsolete. */
  			busy = pc;
  			cond_resched();
  		} else
  			busy = NULL;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2836
  	}
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2837

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2838
2839
2840
  	if (!ret && !list_empty(list))
  		return -EBUSY;
  	return ret;
  }
  
  /*
   * make mem_cgroup's charge to be 0 if there is no task.
   * This enables deleting this mem_cgroup.
   */
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2847
  static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2848
  {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2849
2850
2851
  	int ret;
  	int node, zid, shrink;
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2852
  	struct cgroup *cgrp = mem->css.cgroup;
8869b8f6e   Hugh Dickins   memcg: memcontrol...
2853

cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2854
  	css_get(&mem->css);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2855
2856
  
  	shrink = 0;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2857
2858
2859
  	/* should free all ? */
  	if (free_all)
  		goto try_to_free;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2860
  move_account:
fce664775   Daisuke Nishimura   memcg: ensure lis...
2861
  	do {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2862
  		ret = -EBUSY;
  		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
  			goto out;
  		ret = -EINTR;
  		if (signal_pending(current))
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2867
  			goto out;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2868
2869
  		/* This is for making all *used* pages to be on LRU. */
  		lru_add_drain_all();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2870
  		drain_all_stock_sync();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2871
  		ret = 0;
299b4eaa3   KAMEZAWA Hiroyuki   memcg: NULL point...
2872
  		for_each_node_state(node, N_HIGH_MEMORY) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2873
  			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
b69408e88   Christoph Lameter   vmscan: Use an in...
2874
  				enum lru_list l;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2875
2876
  				for_each_lru(l) {
  					ret = mem_cgroup_force_empty_list(mem,
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2877
  							node, zid, l);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2878
2879
2880
  					if (ret)
  						break;
  				}
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
2881
  			}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2882
2883
2884
  			if (ret)
  				break;
  		}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2885
  		memcg_oom_recover(mem);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2886
2887
2888
  		/* it seems parent cgroup doesn't have enough mem */
  		if (ret == -ENOMEM)
  			goto try_to_free;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2889
  		cond_resched();
fce664775   Daisuke Nishimura   memcg: ensure lis...
2890
2891
  	/* "ret" should also be checked to ensure all lists are empty. */
  	} while (mem->res.usage > 0 || ret);
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2892
2893
2894
  out:
  	css_put(&mem->css);
  	return ret;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2895
2896
  
  try_to_free:
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2897
2898
  	/* returns EBUSY if there is a task or if we come here twice. */
  	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2899
2900
2901
  		ret = -EBUSY;
  		goto out;
  	}
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2902
2903
	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
  	/* try to free all pages in this cgroup */
  	shrink = 1;
  	while (nr_retries && mem->res.usage > 0) {
  		int progress;
  
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			goto out;
  		}
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
2913
2914
  		progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
  						false, get_swappiness(mem));
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2915
  		if (!progress) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2916
  			nr_retries--;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2917
  			/* maybe some writeback is necessary */
8aa7e847d   Jens Axboe   Fix congestion_wa...
2918
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2919
  		}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2920
2921
  
  	}
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2922
  	lru_add_drain();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2923
  	/* try move_account...there may be some *locked* pages. */
fce664775   Daisuke Nishimura   memcg: ensure lis...
2924
  	goto move_account;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2925
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2926
2927
2928
2929
  int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
  {
  	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
  }
  static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
  {
  	return mem_cgroup_from_cont(cont)->use_hierarchy;
  }
  
  static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
  					u64 val)
  {
  	int retval = 0;
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
  	struct cgroup *parent = cont->parent;
  	struct mem_cgroup *parent_mem = NULL;
  
  	if (parent)
  		parent_mem = mem_cgroup_from_cont(parent);
  
  	cgroup_lock();
	/*
	 * If parent's use_hierarchy is set, we can't make any modifications
	 * in the child subtrees. If it is unset, then the change can
	 * occur, provided the current cgroup has no children.
	 *
	 * For the root cgroup, parent_mem is NULL, we allow value to be
	 * set if there are no children.
	 */
  	if ((!parent_mem || !parent_mem->use_hierarchy) &&
  				(val == 1 || val == 0)) {
  		if (list_empty(&cont->children))
  			mem->use_hierarchy = val;
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
  	cgroup_unlock();
  
  	return retval;
  }
  struct mem_cgroup_idx_data {
  	s64 val;
  	enum mem_cgroup_stat_index idx;
  };
  
  static int
  mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
  {
  	struct mem_cgroup_idx_data *d = data;
	d->val += mem_cgroup_read_stat(mem, d->idx);
  	return 0;
  }
  
  static void
  mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
  				enum mem_cgroup_stat_index idx, s64 *val)
  {
  	struct mem_cgroup_idx_data d;
  	d.idx = idx;
  	d.val = 0;
  	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
  	*val = d.val;
  }
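/*
 * Read the memory (or memory+swap) usage of a cgroup. For the root cgroup
 * the res_counter is not charged (see the mem_cgroup_is_root() checks in
 * the charge/uncharge paths), so its usage is reconstructed from the
 * hierarchical statistics (cache + rss, plus swap when requested) instead.
 */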
  static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
  {
  	u64 idx_val, val;
  
  	if (!mem_cgroup_is_root(mem)) {
  		if (!swap)
  			return res_counter_read_u64(&mem->res, RES_USAGE);
  		else
  			return res_counter_read_u64(&mem->memsw, RES_USAGE);
  	}
  
  	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
  	val = idx_val;
  	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
  	val += idx_val;
  
  	if (swap) {
  		mem_cgroup_get_recursive_idx_stat(mem,
  				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
  		val += idx_val;
  	}
  
  	return val << PAGE_SHIFT;
  }
static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
{
	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
	u64 val;
	int type, name;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (type) {
	case _MEM:
		if (name == RES_USAGE)
			val = mem_cgroup_usage(mem, false);
		else
			val = res_counter_read_u64(&mem->res, name);
		break;
	case _MEMSWAP:
		if (name == RES_USAGE)
			val = mem_cgroup_usage(mem, true);
		else
			val = res_counter_read_u64(&mem->memsw, name);
		break;
	default:
		BUG();
		break;
	}
	return val;
  }
  /*
   * The user of this function is...
   * RES_LIMIT.
   */
static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
			    const char *buffer)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	int type, name;
	unsigned long long val;
	int ret;

	type = MEMFILE_TYPE(cft->private);
	name = MEMFILE_ATTR(cft->private);
	switch (name) {
	case RES_LIMIT:
		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
			ret = -EINVAL;
			break;
		}
		/* This function does all necessary parse...reuse it */
		ret = res_counter_memparse_write_strategy(buffer, &val);
		if (ret)
			break;
		if (type == _MEM)
			ret = mem_cgroup_resize_limit(memcg, val);
		else
			ret = mem_cgroup_resize_memsw_limit(memcg, val);
		break;
  	case RES_SOFT_LIMIT:
  		ret = res_counter_memparse_write_strategy(buffer, &val);
  		if (ret)
  			break;
  		/*
  		 * For memsw, soft limits are hard to implement in terms
  		 * of semantics, for now, we support soft limits for
  		 * control without swap
  		 */
  		if (type == _MEM)
  			ret = res_counter_set_soft_limit(&memcg->res, val);
  		else
  			ret = -EINVAL;
  		break;
  	default:
  		ret = -EINVAL; /* should be BUG() ? */
  		break;
  	}
  	return ret;
8cdea7c05   Balbir Singh   Memory controller...
3088
  }
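/*
 * Compute the effective limits of a cgroup: walk up the hierarchy (while
 * use_hierarchy is set) and take the minimum memory and memsw limits seen
 * along the way. Used by memory.stat to report hierarchical_memory_limit
 * and hierarchical_memsw_limit.
 */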
  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
  		unsigned long long *mem_limit, unsigned long long *memsw_limit)
  {
  	struct cgroup *cgroup;
  	unsigned long long min_limit, min_memsw_limit, tmp;
  
  	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  	cgroup = memcg->css.cgroup;
  	if (!memcg->use_hierarchy)
  		goto out;
  
  	while (cgroup->parent) {
  		cgroup = cgroup->parent;
  		memcg = mem_cgroup_from_cont(cgroup);
  		if (!memcg->use_hierarchy)
  			break;
  		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		min_limit = min(min_limit, tmp);
  		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		min_memsw_limit = min(min_memsw_limit, tmp);
  	}
  out:
  	*mem_limit = min_limit;
  	*memsw_limit = min_memsw_limit;
  	return;
  }
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3116
  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
c84872e16   Pavel Emelyanov   memcgroup: add th...
3117
3118
  {
  	struct mem_cgroup *mem;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3119
  	int type, name;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3120
3121
  
  	mem = mem_cgroup_from_cont(cont);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3122
3123
3124
  	type = MEMFILE_TYPE(event);
  	name = MEMFILE_ATTR(event);
  	switch (name) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3125
  	case RES_MAX_USAGE:
  		if (type == _MEM)
  			res_counter_reset_max(&mem->res);
  		else
  			res_counter_reset_max(&mem->memsw);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3130
3131
  		break;
  	case RES_FAILCNT:
  		if (type == _MEM)
  			res_counter_reset_failcnt(&mem->res);
  		else
  			res_counter_reset_failcnt(&mem->memsw);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3136
3137
  		break;
  	}
f64c3f549   Balbir Singh   memory controller...
3138

85cc59db1   Pavel Emelyanov   memcgroup: use tr...
3139
  	return 0;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3140
  }
  static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
  					struct cftype *cft)
  {
  	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
  }
024914477   Daisuke Nishimura   memcg: move charg...
3146
  #ifdef CONFIG_MMU
  static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
  					struct cftype *cft, u64 val)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
  
  	if (val >= (1 << NR_MOVE_TYPE))
  		return -EINVAL;
  	/*
	 * We check this value several times in both can_attach() and
  	 * attach(), so we need cgroup lock to prevent this value from being
  	 * inconsistent.
  	 */
  	cgroup_lock();
  	mem->move_charge_at_immigrate = val;
  	cgroup_unlock();
  
  	return 0;
  }
  #else
  static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
3172

  
  /* For read statistics */
  enum {
  	MCS_CACHE,
  	MCS_RSS,
d8046582d   KAMEZAWA Hiroyuki   memcg: make memcg...
3178
  	MCS_FILE_MAPPED,
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3179
3180
  	MCS_PGPGIN,
  	MCS_PGPGOUT,
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3181
  	MCS_SWAP,
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
  	MCS_INACTIVE_ANON,
  	MCS_ACTIVE_ANON,
  	MCS_INACTIVE_FILE,
  	MCS_ACTIVE_FILE,
  	MCS_UNEVICTABLE,
  	NR_MCS_STAT,
  };
  
  struct mcs_total_stat {
  	s64 stat[NR_MCS_STAT];
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3192
  };
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3193
3194
3195
3196
3197
3198
  struct {
  	char *local_name;
  	char *total_name;
  } memcg_stat_strings[NR_MCS_STAT] = {
  	{"cache", "total_cache"},
  	{"rss", "total_rss"},
d69b042f3   Balbir Singh   memcg: add file-b...
3199
  	{"mapped_file", "total_mapped_file"},
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3200
3201
  	{"pgpgin", "total_pgpgin"},
  	{"pgpgout", "total_pgpgout"},
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3202
  	{"swap", "total_swap"},
  	{"inactive_anon", "total_inactive_anon"},
  	{"active_anon", "total_active_anon"},
  	{"inactive_file", "total_inactive_file"},
  	{"active_file", "total_active_file"},
  	{"unevictable", "total_unevictable"}
  };
  
  
  static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
  {
  	struct mcs_total_stat *s = data;
  	s64 val;
  
  	/* per cpu stat */
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3217
  	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3218
  	s->stat[MCS_CACHE] += val * PAGE_SIZE;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3219
  	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3220
  	s->stat[MCS_RSS] += val * PAGE_SIZE;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3221
  	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
d8046582d   KAMEZAWA Hiroyuki   memcg: make memcg...
3222
  	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3223
  	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3224
  	s->stat[MCS_PGPGIN] += val;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3225
  	val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3226
  	s->stat[MCS_PGPGOUT] += val;
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3227
  	if (do_swap_account) {
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3228
  		val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3229
3230
  		s->stat[MCS_SWAP] += val * PAGE_SIZE;
  	}
  
  	/* per zone stat */
  	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
  	s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
  	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
  	s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
  	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
  	s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
  	val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
  	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
  	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
  	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
  	return 0;
  }
  
  static void
  mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
  {
  	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
  }
c64745cf0   Paul Menage   CGroup API files:...
3251
3252
  static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
  				 struct cgroup_map_cb *cb)
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3253
  {
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3254
  	struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3255
  	struct mcs_total_stat mystat;
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3256
  	int i;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3257
3258
  	memset(&mystat, 0, sizeof(mystat));
  	mem_cgroup_get_local_stat(mem_cont, &mystat);
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3259

1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3260
3261
3262
  	for (i = 0; i < NR_MCS_STAT; i++) {
  		if (i == MCS_SWAP && !do_swap_account)
  			continue;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3263
  		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3264
  	}
7b854121e   Lee Schermerhorn   Unevictable LRU P...
3265

14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3266
  	/* Hierarchical information */
  	{
  		unsigned long long limit, memsw_limit;
  		memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
  		cb->fill(cb, "hierarchical_memory_limit", limit);
  		if (do_swap_account)
  			cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
  	}
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3274

14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3275
3276
  	memset(&mystat, 0, sizeof(mystat));
  	mem_cgroup_get_total_stat(mem_cont, &mystat);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3277
3278
3279
  	for (i = 0; i < NR_MCS_STAT; i++) {
  		if (i == MCS_SWAP && !do_swap_account)
  			continue;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3280
  		cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3281
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3282

7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3283
  #ifdef CONFIG_DEBUG_VM
c772be939   KOSAKI Motohiro   memcg: fix calcul...
3284
  	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
  
  	{
  		int nid, zid;
  		struct mem_cgroup_per_zone *mz;
  		unsigned long recent_rotated[2] = {0, 0};
  		unsigned long recent_scanned[2] = {0, 0};
  
  		for_each_online_node(nid)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  				mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
  
  				recent_rotated[0] +=
  					mz->reclaim_stat.recent_rotated[0];
  				recent_rotated[1] +=
  					mz->reclaim_stat.recent_rotated[1];
  				recent_scanned[0] +=
  					mz->reclaim_stat.recent_scanned[0];
  				recent_scanned[1] +=
  					mz->reclaim_stat.recent_scanned[1];
  			}
  		cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
  		cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
  		cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
  		cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
  	}
  #endif
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3311
3312
  	return 0;
  }
  static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  
  	return get_swappiness(memcg);
  }
  
  static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
  				       u64 val)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  	struct mem_cgroup *parent;
068b38c1f   Li Zefan   memcg: fix a race...
3325

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3326
3327
3328
3329
3330
3331
3332
  	if (val > 100)
  		return -EINVAL;
  
  	if (cgrp->parent == NULL)
  		return -EINVAL;
  
  	parent = mem_cgroup_from_cont(cgrp->parent);
068b38c1f   Li Zefan   memcg: fix a race...
3333
3334
  
  	cgroup_lock();
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3335
3336
  	/* Under hierarchy, only an empty root (no children) may set this value */
  	if ((parent->use_hierarchy) ||
068b38c1f   Li Zefan   memcg: fix a race...
3337
3338
  	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
  		cgroup_unlock();
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3339
  		return -EINVAL;
068b38c1f   Li Zefan   memcg: fix a race...
3340
  	}
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3341
3342
3343
3344
  
  	spin_lock(&memcg->reclaim_param_lock);
  	memcg->swappiness = val;
  	spin_unlock(&memcg->reclaim_param_lock);
068b38c1f   Li Zefan   memcg: fix a race...
3345
  	cgroup_unlock();
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3346
3347
  	return 0;
  }
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3348
3349
3350
3351
3352
3353
3354
3355
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
  	struct mem_cgroup_threshold_ary *t;
  	u64 usage;
  	int i;
  
  	rcu_read_lock();
  	if (!swap)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3356
  		t = rcu_dereference(memcg->thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3357
  	else
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3358
  		t = rcu_dereference(memcg->memsw_thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
  
  	if (!t)
  		goto unlock;
  
  	usage = mem_cgroup_usage(memcg, swap);
  
  	/*
  	 * current_threshold points to the threshold just below usage.
  	 * If that is not the case, a threshold was crossed after the
  	 * last call of __mem_cgroup_threshold().
  	 */
5407a5625   Phil Carmody   mm: remove unnece...
3370
  	i = t->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
  
  	/*
  	 * Iterate backward over array of thresholds starting from
  	 * current_threshold and check if a threshold is crossed.
  	 * If none of the thresholds below usage has been crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* i = current_threshold + 1 */
  	i++;
  
  	/*
  	 * Iterate forward over array of thresholds starting from
  	 * current_threshold+1 and check if a threshold is crossed.
  	 * If none of the thresholds above usage has been crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* Update current_threshold */
5407a5625   Phil Carmody   mm: remove unnece...
3394
  	t->current_threshold = i - 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
  unlock:
  	rcu_read_unlock();
  }
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
  	__mem_cgroup_threshold(memcg, false);
  	if (do_swap_account)
  		__mem_cgroup_threshold(memcg, true);
  }
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
  
  	/* thresholds are u64; comparing by subtraction can overflow the int result */
  	if (_a->threshold > _b->threshold)
  		return 1;
  	if (_a->threshold < _b->threshold)
  		return -1;
  	return 0;
  }
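/*
 * Illustrative userspace model of the scan in __mem_cgroup_threshold()
 * above (a sketch, not kernel code): thresholds are kept sorted,
 * "current" points at the highest threshold <= usage, and every
 * threshold passed since the last check gets one notification.  All
 * names here are made up for the example.
 */
#include <stdio.h>

struct toy_threshold { unsigned long long threshold; int hits; };

static void toy_check(struct toy_threshold *t, int size, int *current_idx,
		      unsigned long long usage)
{
	int i = *current_idx;

	/* usage dropped: signal the thresholds we fell back below */
	for (; i >= 0 && t[i].threshold > usage; i--)
		t[i].hits++;
	/* usage grew: signal the thresholds we climbed past */
	for (i++; i < size && t[i].threshold <= usage; i++)
		t[i].hits++;
	*current_idx = i - 1;	/* highest threshold still <= usage */
}

int main(void)
{
	struct toy_threshold t[] = { {100, 0}, {200, 0}, {300, 0} };
	int cur = -1;
	int i;

	toy_check(t, 3, &cur, 250);	/* crosses 100 and 200 on the way up */
	toy_check(t, 3, &cur, 50);	/* crosses them again on the way down */
	for (i = 0; i < 3; i++)
		printf("threshold %llu signalled %d time(s)\n",
		       t[i].threshold, t[i].hits);
	return 0;
}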
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
  {
  	struct mem_cgroup_eventfd_list *ev;
  
  	list_for_each_entry(ev, &mem->oom_notify, list)
  		eventfd_signal(ev->eventfd, 1);
  	return 0;
  }
  
  static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
  {
  	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
  }
  
  static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
  	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3429
3430
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3431
3432
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3433
3434
  	int type = MEMFILE_TYPE(cft->private);
  	u64 threshold, usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3435
  	int i, size, ret;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3436
3437
3438
3439
3440
3441
  
  	ret = res_counter_memparse_write_strategy(args, &threshold);
  	if (ret)
  		return ret;
  
  	mutex_lock(&memcg->thresholds_lock);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3442

2e72b6347   Kirill A. Shutemov   memcg: implement ...
3443
  	if (type == _MEM)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3444
  		thresholds = &memcg->thresholds;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3445
  	else if (type == _MEMSWAP)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3446
  		thresholds = &memcg->memsw_thresholds;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3447
3448
3449
3450
3451
3452
  	else
  		BUG();
  
  	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
  
  	/* Check if a threshold was crossed before adding a new one */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3453
  	if (thresholds->primary)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3454
  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3455
  	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3456
3457
  
  	/* Allocate memory for new array of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3458
  	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3459
  			GFP_KERNEL);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3460
  	if (!new) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3461
3462
3463
  		ret = -ENOMEM;
  		goto unlock;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3464
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3465
3466
  
  	/* Copy thresholds (if any) to new array */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3467
3468
  	if (thresholds->primary) {
  		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3469
  				sizeof(struct mem_cgroup_threshold));
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3470
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3471
  	/* Add new threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3472
3473
  	new->entries[size - 1].eventfd = eventfd;
  	new->entries[size - 1].threshold = threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3474
3475
  
  	/* Sort thresholds. Registering a new threshold isn't time-critical */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3476
  	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3477
3478
3479
  			compare_thresholds, NULL);
  
  	/* Find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3480
  	new->current_threshold = -1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3481
  	for (i = 0; i < size; i++) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3482
  		if (new->entries[i].threshold < usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3483
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3484
3485
  			 * new->current_threshold will not be used until
  			 * rcu_assign_pointer(), so it's safe to increment
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3486
3487
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3488
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3489
3490
  		}
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3491
3492
3493
3494
3495
  	/* Free old spare buffer and save old primary buffer as spare */
  	kfree(thresholds->spare);
  	thresholds->spare = thresholds->primary;
  
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3496

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3497
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3498
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3499
3500
3501
3502
3503
  unlock:
  	mutex_unlock(&memcg->thresholds_lock);
  
  	return ret;
  }
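/*
 * Hedged userspace sketch (cgroup v1): how the handler above is usually
 * reached.  The cgroup core parses a line written to cgroup.event_control
 * as "<eventfd> <fd of the control file> <args>" and hands <args> ("50M"
 * below) to ->register_event(), which parses it with
 * res_counter_memparse_write_strategy().  The mount point
 * /sys/fs/cgroup/memory/mygroup is an assumption for illustration.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/mygroup";
	char buf[256];
	uint64_t ticks;
	int efd, ufd, cfd, n;

	efd = eventfd(0, 0);
	snprintf(buf, sizeof(buf), "%s/memory.usage_in_bytes", grp);
	ufd = open(buf, O_RDONLY);
	snprintf(buf, sizeof(buf), "%s/cgroup.event_control", grp);
	cfd = open(buf, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* register a 50M threshold on memory.usage_in_bytes */
	n = snprintf(buf, sizeof(buf), "%d %d 50M", efd, ufd);
	if (write(cfd, buf, n) != n)
		return 1;

	/* each crossing increments the eventfd counter */
	if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)ticks);
	return 0;
}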
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3504
  static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3505
  	struct cftype *cft, struct eventfd_ctx *eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3506
3507
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3508
3509
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3510
3511
  	int type = MEMFILE_TYPE(cft->private);
  	u64 usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3512
  	int i, j, size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3513
3514
3515
  
  	mutex_lock(&memcg->thresholds_lock);
  	if (type == _MEM)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3516
  		thresholds = &memcg->thresholds;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3517
  	else if (type == _MEMSWAP)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3518
  		thresholds = &memcg->memsw_thresholds;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
  	else
  		BUG();
  
  	/*
  	 * Something went wrong if we are trying to unregister a threshold
  	 * when we don't have any thresholds at all.
  	 */
  	BUG_ON(!thresholds);
  
  	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
  
  	/* Check if a threshold was crossed before removing */
  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
  	/* Calculate the new number of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3534
3535
3536
  	size = 0;
  	for (i = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd != eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3537
3538
  			size++;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3539
  	new = thresholds->spare;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3540

2e72b6347   Kirill A. Shutemov   memcg: implement ...
3541
3542
  	/* Set thresholds array to NULL if we don't have thresholds */
  	if (!size) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3543
3544
  		kfree(new);
  		new = NULL;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3545
  		goto swap_buffers;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3546
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3547
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3548
3549
  
  	/* Copy thresholds and find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3550
3551
3552
  	new->current_threshold = -1;
  	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd == eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3553
  			continue;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3554
3555
  		new->entries[j] = thresholds->primary->entries[i];
  		if (new->entries[j].threshold < usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3556
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3557
  			 * new->current_threshold will not be used
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3558
3559
3560
  			 * until rcu_assign_pointer(), so it's safe to increment
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3561
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3562
3563
3564
  		}
  		j++;
  	}
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3565
  swap_buffers:
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3566
3567
3568
  	/* Swap primary and spare array */
  	thresholds->spare = thresholds->primary;
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3569

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3570
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3571
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3572
  	mutex_unlock(&memcg->thresholds_lock);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3573
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3574

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
  static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
  	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
  	struct mem_cgroup_eventfd_list *event;
  	int type = MEMFILE_TYPE(cft->private);
  
  	BUG_ON(type != _OOM_TYPE);
  	event = kmalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
  
  	mutex_lock(&memcg_oom_mutex);
  
  	event->eventfd = eventfd;
  	list_add(&event->list, &memcg->oom_notify);
  
  	/* already in OOM ? */
  	if (atomic_read(&memcg->oom_lock))
  		eventfd_signal(eventfd, 1);
  	mutex_unlock(&memcg_oom_mutex);
  
  	return 0;
  }
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3599
  static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
  	struct cftype *cft, struct eventfd_ctx *eventfd)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
  	struct mem_cgroup_eventfd_list *ev, *tmp;
  	int type = MEMFILE_TYPE(cft->private);
  
  	BUG_ON(type != _OOM_TYPE);
  
  	mutex_lock(&memcg_oom_mutex);
  
  	list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
  		if (ev->eventfd == eventfd) {
  			list_del(&ev->list);
  			kfree(ev);
  		}
  	}
  
  	mutex_unlock(&memcg_oom_mutex);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3618
  }
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
  static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
  	struct cftype *cft,  struct cgroup_map_cb *cb)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
  
  	cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
  
  	if (atomic_read(&mem->oom_lock))
  		cb->fill(cb, "under_oom", 1);
  	else
  		cb->fill(cb, "under_oom", 0);
  	return 0;
  }
  
  /*
   * Toggle oom_kill_disable for this memcg: 0 re-enables the OOM killer,
   * 1 disables it.
   */
  static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
  	struct cftype *cft, u64 val)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
  	struct mem_cgroup *parent;
  
  	/* cannot be set on the root cgroup, and only 0 and 1 are allowed */
  	if (!cgrp->parent || !((val == 0) || (val == 1)))
  		return -EINVAL;
  
  	parent = mem_cgroup_from_cont(cgrp->parent);
  
  	cgroup_lock();
  	/* oom_kill_disable applies to the whole sub-hierarchy; only an empty root may change it. */
  	if ((parent->use_hierarchy) ||
  	    (mem->use_hierarchy && !list_empty(&cgrp->children))) {
  		cgroup_unlock();
  		return -EINVAL;
  	}
  	mem->oom_kill_disable = val;
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
3655
3656
  	if (!val)
  		memcg_oom_recover(mem);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3657
3658
3659
  	cgroup_unlock();
  	return 0;
  }
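/*
 * Hedged userspace sketch (cgroup v1): memory.oom_control as served by
 * the two handlers above.  Writing "1" sets oom_kill_disable (tasks that
 * hit the limit wait instead of being killed); reading reports
 * "oom_kill_disable" and "under_oom".  The path is an illustrative
 * assumption.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.oom_control";
	char buf[128];
	ssize_t n;
	int wfd, rfd;

	wfd = open(path, O_WRONLY);
	if (wfd < 0 || write(wfd, "1", 1) != 1)
		return 1;
	close(wfd);

	rfd = open(path, O_RDONLY);
	if (rfd < 0)
		return 1;
	n = read(rfd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* e.g. "oom_kill_disable 1\nunder_oom 0\n" */
	}
	close(rfd);
	return 0;
}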
8cdea7c05   Balbir Singh   Memory controller...
3660
3661
  static struct cftype mem_cgroup_files[] = {
  	{
0eea10301   Balbir Singh   Memory controller...
3662
  		.name = "usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3663
  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
2c3daa722   Paul Menage   CGroup API files:...
3664
  		.read_u64 = mem_cgroup_read,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3665
3666
  		.register_event = mem_cgroup_usage_register_event,
  		.unregister_event = mem_cgroup_usage_unregister_event,
8cdea7c05   Balbir Singh   Memory controller...
3667
3668
  	},
  	{
c84872e16   Pavel Emelyanov   memcgroup: add th...
3669
  		.name = "max_usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3670
  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3671
  		.trigger = mem_cgroup_reset,
c84872e16   Pavel Emelyanov   memcgroup: add th...
3672
3673
3674
  		.read_u64 = mem_cgroup_read,
  	},
  	{
0eea10301   Balbir Singh   Memory controller...
3675
  		.name = "limit_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3676
  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
856c13aa1   Paul Menage   cgroup files: con...
3677
  		.write_string = mem_cgroup_write,
2c3daa722   Paul Menage   CGroup API files:...
3678
  		.read_u64 = mem_cgroup_read,
8cdea7c05   Balbir Singh   Memory controller...
3679
3680
  	},
  	{
296c81d89   Balbir Singh   memory controller...
3681
3682
3683
3684
3685
3686
  		.name = "soft_limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
  		.write_string = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
8cdea7c05   Balbir Singh   Memory controller...
3687
  		.name = "failcnt",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3688
  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3689
  		.trigger = mem_cgroup_reset,
2c3daa722   Paul Menage   CGroup API files:...
3690
  		.read_u64 = mem_cgroup_read,
8cdea7c05   Balbir Singh   Memory controller...
3691
  	},
8697d3319   Balbir Singh   Memory controller...
3692
  	{
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3693
  		.name = "stat",
c64745cf0   Paul Menage   CGroup API files:...
3694
  		.read_map = mem_control_stat_show,
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3695
  	},
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3696
3697
3698
3699
  	{
  		.name = "force_empty",
  		.trigger = mem_cgroup_force_empty_write,
  	},
18f59ea7d   Balbir Singh   memcg: memory cgr...
3700
3701
3702
3703
3704
  	{
  		.name = "use_hierarchy",
  		.write_u64 = mem_cgroup_hierarchy_write,
  		.read_u64 = mem_cgroup_hierarchy_read,
  	},
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3705
3706
3707
3708
3709
  	{
  		.name = "swappiness",
  		.read_u64 = mem_cgroup_swappiness_read,
  		.write_u64 = mem_cgroup_swappiness_write,
  	},
7dc74be03   Daisuke Nishimura   memcg: add interf...
3710
3711
3712
3713
3714
  	{
  		.name = "move_charge_at_immigrate",
  		.read_u64 = mem_cgroup_move_charge_read,
  		.write_u64 = mem_cgroup_move_charge_write,
  	},
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3715
3716
  	{
  		.name = "oom_control",
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3717
3718
  		.read_map = mem_cgroup_oom_control_read,
  		.write_u64 = mem_cgroup_oom_control_write,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3719
3720
3721
3722
  		.register_event = mem_cgroup_oom_register_event,
  		.unregister_event = mem_cgroup_oom_unregister_event,
  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  	},
8cdea7c05   Balbir Singh   Memory controller...
3723
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3724
3725
3726
3727
3728
3729
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  static struct cftype memsw_cgroup_files[] = {
  	{
  		.name = "memsw.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
  		.read_u64 = mem_cgroup_read,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3730
3731
  		.register_event = mem_cgroup_usage_register_event,
  		.unregister_event = mem_cgroup_usage_unregister_event,
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
  	},
  	{
  		.name = "memsw.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
  		.trigger = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "memsw.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
  		.write_string = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read,
  	},
  	{
  		.name = "memsw.failcnt",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
  		.trigger = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read,
  	},
  };
  
  static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
  {
  	if (!do_swap_account)
  		return 0;
  	return cgroup_add_files(cont, ss, memsw_cgroup_files,
  				ARRAY_SIZE(memsw_cgroup_files));
  };
  #else
  static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
  {
  	return 0;
  }
  #endif
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3766
3767
3768
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
  {
  	struct mem_cgroup_per_node *pn;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3769
  	struct mem_cgroup_per_zone *mz;
b69408e88   Christoph Lameter   vmscan: Use an in...
3770
  	enum lru_list l;
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
3771
  	int zone, tmp = node;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3772
3773
3774
3775
3776
3777
3778
3779
  	/*
  	 * This routine is called against possible nodes.
  	 * But it's a BUG to call kmalloc() against an offline node.
  	 *
  	 * TODO: this routine can waste a lot of memory for nodes which will
  	 *       never be onlined. It would be better to use a memory hotplug
  	 *       callback function.
  	 */
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
3780
3781
3782
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
  	pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3783
3784
  	if (!pn)
  		return 1;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3785

6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3786
3787
  	mem->info.nodeinfo[node] = pn;
  	memset(pn, 0, sizeof(*pn));
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3788
3789
3790
  
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
b69408e88   Christoph Lameter   vmscan: Use an in...
3791
3792
  		for_each_lru(l)
  			INIT_LIST_HEAD(&mz->lists[l]);
f64c3f549   Balbir Singh   memory controller...
3793
  		mz->usage_in_excess = 0;
4e4169535   Balbir Singh   memory controller...
3794
3795
  		mz->on_tree = false;
  		mz->mem = mem;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3796
  	}
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3797
3798
  	return 0;
  }
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3799
3800
3801
3802
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
  {
  	kfree(mem->info.nodeinfo[node]);
  }
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3803
3804
3805
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
  	struct mem_cgroup *mem;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3806
  	int size = sizeof(struct mem_cgroup);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3807

c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3808
  	/* Can be very big if MAX_NUMNODES is very big */
c8dad2bb6   Jan Blunck   memcg: reduce siz...
3809
3810
  	if (size < PAGE_SIZE)
  		mem = kmalloc(size, GFP_KERNEL);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3811
  	else
c8dad2bb6   Jan Blunck   memcg: reduce siz...
3812
  		mem = vmalloc(size);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3813

e7bbcdf37   Dan Carpenter   memcontrol: fix p...
3814
3815
3816
3817
  	if (!mem)
  		return NULL;
  
  	memset(mem, 0, size);
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3818
3819
3820
3821
3822
3823
3824
3825
  	mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  	if (!mem->stat) {
  		if (size < PAGE_SIZE)
  			kfree(mem);
  		else
  			vfree(mem);
  		mem = NULL;
  	}
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3826
3827
  	return mem;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3828
3829
3830
3831
3832
3833
3834
3835
  /*
   * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
   * (Scanning everything at force_empty is too costly...)
   *
   * Instead of clearing all references at force_empty, we remember
   * the number of references from swap_cgroup and free the mem_cgroup
   * when it drops to 0.
   *
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3836
3837
   * Removal of cgroup itself succeeds regardless of refs from swap.
   */
a7ba0eef3   KAMEZAWA Hiroyuki   memcg: fix double...
3838
  static void __mem_cgroup_free(struct mem_cgroup *mem)
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3839
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3840
  	int node;
f64c3f549   Balbir Singh   memory controller...
3841
  	mem_cgroup_remove_from_trees(mem);
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
3842
  	free_css_id(&mem_cgroup_subsys, &mem->css);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3843
3844
  	for_each_node_state(node, N_POSSIBLE)
  		free_mem_cgroup_per_zone_info(mem, node);
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
3845
3846
  	free_percpu(mem->stat);
  	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3847
3848
3849
3850
  		kfree(mem);
  	else
  		vfree(mem);
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3851
3852
3853
3854
  static void mem_cgroup_get(struct mem_cgroup *mem)
  {
  	atomic_inc(&mem->refcnt);
  }
483c30b51   Daisuke Nishimura   memcg: improve pe...
3855
  static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3856
  {
483c30b51   Daisuke Nishimura   memcg: improve pe...
3857
  	if (atomic_sub_and_test(count, &mem->refcnt)) {
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
3858
  		struct mem_cgroup *parent = parent_mem_cgroup(mem);
a7ba0eef3   KAMEZAWA Hiroyuki   memcg: fix double...
3859
  		__mem_cgroup_free(mem);
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
3860
3861
3862
  		if (parent)
  			mem_cgroup_put(parent);
  	}
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3863
  }
483c30b51   Daisuke Nishimura   memcg: improve pe...
3864
3865
3866
3867
  static void mem_cgroup_put(struct mem_cgroup *mem)
  {
  	__mem_cgroup_put(mem, 1);
  }
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
3868
3869
3870
3871
3872
3873
3874
3875
3876
  /*
   * Returns the parent mem_cgroup in the memcg hierarchy (non-NULL only with use_hierarchy enabled).
   */
  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
  {
  	if (!mem->res.parent)
  		return NULL;
  	return mem_cgroup_from_res_counter(mem->res.parent, res);
  }
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
3877

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3878
3879
3880
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  static void __init enable_swap_cgroup(void)
  {
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
3881
  	if (!mem_cgroup_disabled() && really_do_swap_account)
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3882
3883
3884
3885
3886
3887
3888
  		do_swap_account = 1;
  }
  #else
  static void __init enable_swap_cgroup(void)
  {
  }
  #endif
f64c3f549   Balbir Singh   memory controller...
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
  static int mem_cgroup_soft_limit_tree_init(void)
  {
  	struct mem_cgroup_tree_per_node *rtpn;
  	struct mem_cgroup_tree_per_zone *rtpz;
  	int tmp, node, zone;
  
  	for_each_node_state(node, N_POSSIBLE) {
  		tmp = node;
  		if (!node_state(node, N_NORMAL_MEMORY))
  			tmp = -1;
  		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
  		if (!rtpn)
  			return 1;
  
  		soft_limit_tree.rb_tree_per_node[node] = rtpn;
  
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			rtpz = &rtpn->rb_tree_per_zone[zone];
  			rtpz->rb_root = RB_ROOT;
  			spin_lock_init(&rtpz->lock);
  		}
  	}
  	return 0;
  }
0eb253e22   Li Zefan   memcg: fix sectio...
3913
  static struct cgroup_subsys_state * __ref
8cdea7c05   Balbir Singh   Memory controller...
3914
3915
  mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
  {
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3916
  	struct mem_cgroup *mem, *parent;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
3917
  	long error = -ENOMEM;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3918
  	int node;
8cdea7c05   Balbir Singh   Memory controller...
3919

c8dad2bb6   Jan Blunck   memcg: reduce siz...
3920
3921
  	mem = mem_cgroup_alloc();
  	if (!mem)
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
3922
  		return ERR_PTR(error);
78fb74669   Pavel Emelianov   Memory controller...
3923

6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3924
3925
3926
  	for_each_node_state(node, N_POSSIBLE)
  		if (alloc_mem_cgroup_per_zone_info(mem, node))
  			goto free_out;
f64c3f549   Balbir Singh   memory controller...
3927

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3928
  	/* root ? */
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3929
  	if (cont->parent == NULL) {
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
3930
  		int cpu;
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3931
  		enable_swap_cgroup();
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3932
  		parent = NULL;
4b3bde4c9   Balbir Singh   memcg: remove the...
3933
  		root_mem_cgroup = mem;
f64c3f549   Balbir Singh   memory controller...
3934
3935
  		if (mem_cgroup_soft_limit_tree_init())
  			goto free_out;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
3936
3937
3938
3939
3940
3941
  		for_each_possible_cpu(cpu) {
  			struct memcg_stock_pcp *stock =
  						&per_cpu(memcg_stock, cpu);
  			INIT_WORK(&stock->work, drain_local_stock);
  		}
  		hotcpu_notifier(memcg_stock_cpu_callback, 0);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3942
  	} else {
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3943
  		parent = mem_cgroup_from_cont(cont->parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3944
  		mem->use_hierarchy = parent->use_hierarchy;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3945
  		mem->oom_kill_disable = parent->oom_kill_disable;
18f59ea7d   Balbir Singh   memcg: memory cgr...
3946
  	}
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3947

18f59ea7d   Balbir Singh   memcg: memory cgr...
3948
3949
3950
  	if (parent && parent->use_hierarchy) {
  		res_counter_init(&mem->res, &parent->res);
  		res_counter_init(&mem->memsw, &parent->memsw);
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
3951
3952
3953
3954
3955
3956
3957
  		/*
  		 * We increment refcnt of the parent to ensure that we can
  		 * safely access it on res_counter_charge/uncharge.
  		 * This refcnt will be decremented when freeing this
  		 * mem_cgroup (see mem_cgroup_put).
  		 */
  		mem_cgroup_get(parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3958
3959
3960
3961
  	} else {
  		res_counter_init(&mem->res, NULL);
  		res_counter_init(&mem->memsw, NULL);
  	}
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
3962
  	mem->last_scanned_child = 0;
2733c06ac   KOSAKI Motohiro   memcg: protect pr...
3963
  	spin_lock_init(&mem->reclaim_param_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3964
  	INIT_LIST_HEAD(&mem->oom_notify);
6d61ef409   Balbir Singh   memcg: memory cgr...
3965

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3966
3967
  	if (parent)
  		mem->swappiness = get_swappiness(parent);
a7ba0eef3   KAMEZAWA Hiroyuki   memcg: fix double...
3968
  	atomic_set(&mem->refcnt, 1);
7dc74be03   Daisuke Nishimura   memcg: add interf...
3969
  	mem->move_charge_at_immigrate = 0;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3970
  	mutex_init(&mem->thresholds_lock);
8cdea7c05   Balbir Singh   Memory controller...
3971
  	return &mem->css;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3972
  free_out:
a7ba0eef3   KAMEZAWA Hiroyuki   memcg: fix double...
3973
  	__mem_cgroup_free(mem);
4b3bde4c9   Balbir Singh   memcg: remove the...
3974
  	root_mem_cgroup = NULL;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
3975
  	return ERR_PTR(error);
8cdea7c05   Balbir Singh   Memory controller...
3976
  }
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3977
  static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
3978
3979
3980
  					struct cgroup *cont)
  {
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
3981
3982
  
  	return mem_cgroup_force_empty(mem, false);
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
3983
  }
8cdea7c05   Balbir Singh   Memory controller...
3984
3985
3986
  static void mem_cgroup_destroy(struct cgroup_subsys *ss,
  				struct cgroup *cont)
  {
c268e9946   Daisuke Nishimura   memcg: fix hierar...
3987
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
c268e9946   Daisuke Nishimura   memcg: fix hierar...
3988

c268e9946   Daisuke Nishimura   memcg: fix hierar...
3989
  	mem_cgroup_put(mem);
8cdea7c05   Balbir Singh   Memory controller...
3990
3991
3992
3993
3994
  }
  
  static int mem_cgroup_populate(struct cgroup_subsys *ss,
  				struct cgroup *cont)
  {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3995
3996
3997
3998
3999
4000
4001
4002
  	int ret;
  
  	ret = cgroup_add_files(cont, ss, mem_cgroup_files,
  				ARRAY_SIZE(mem_cgroup_files));
  
  	if (!ret)
  		ret = register_memsw_files(cont, ss);
  	return ret;
8cdea7c05   Balbir Singh   Memory controller...
4003
  }
024914477   Daisuke Nishimura   memcg: move charg...
4004
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
4005
  /* Handlers for move charge at task migration. */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4006
4007
  #define PRECHARGE_COUNT_AT_ONCE	256
  static int mem_cgroup_do_precharge(unsigned long count)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4008
  {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4009
4010
  	int ret = 0;
  	int batch_count = PRECHARGE_COUNT_AT_ONCE;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4011
  	struct mem_cgroup *mem = mc.to;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
  	if (mem_cgroup_is_root(mem)) {
  		mc.precharge += count;
  		/* we don't need css_get for root */
  		return ret;
  	}
  	/* try to charge at once */
  	if (count > 1) {
  		struct res_counter *dummy;
  		/*
  		 * "mem" cannot be under rmdir() because we've already checked
  		 * by cgroup_lock_live_cgroup() that it is not removed and we
  		 * are still under the same cgroup_mutex. So we can postpone
  		 * css_get().
  		 */
  		if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
  			goto one_by_one;
  		if (do_swap_account && res_counter_charge(&mem->memsw,
  						PAGE_SIZE * count, &dummy)) {
  			res_counter_uncharge(&mem->res, PAGE_SIZE * count);
  			goto one_by_one;
  		}
  		mc.precharge += count;
  		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
  		WARN_ON_ONCE(count > INT_MAX);
  		__css_get(&mem->css, (int)count);
  		return ret;
  	}
  one_by_one:
  	/* fall back to one by one charge */
  	while (count--) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
  		if (!batch_count--) {
  			batch_count = PRECHARGE_COUNT_AT_ONCE;
  			cond_resched();
  		}
430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
4050
  		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4051
4052
4053
4054
4055
  		if (ret || !mem)
  			/* mem_cgroup_clear_mc() will do uncharge later */
  			return -ENOMEM;
  		mc.precharge++;
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4056
4057
4058
4059
4060
4061
4062
4063
  	return ret;
  }
  
  /**
   * is_target_pte_for_mc - check whether a pte is a valid target for move charge
   * @vma: the vma to which the pte to be checked belongs
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
4064
   * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4065
4066
4067
4068
4069
4070
   *
   * Returns
   *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   *     move charge. If @target is not NULL, the page is stored in target->page
   *     with an extra refcount taken (callers should handle it).
024914477   Daisuke Nishimura   memcg: move charg...
4071
4072
4073
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. If @target is not NULL, the entry is stored
   *     in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4074
4075
4076
   *
   * Called with pte lock held.
   */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4077
4078
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
4079
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4080
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4081
4082
4083
  enum mc_target_type {
  	MC_TARGET_NONE,	/* not used */
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
4084
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4085
  };
90254a658   Daisuke Nishimura   memcg: clean up m...
4086
4087
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4088
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4089
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4090

90254a658   Daisuke Nishimura   memcg: clean up m...
4091
4092
4093
4094
4095
4096
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
  		/* we don't move shared anon */
  		if (!move_anon() || page_mapcount(page) > 2)
  			return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4097
4098
  	} else if (!move_file())
  		/* we ignore mapcount for file pages */
90254a658   Daisuke Nishimura   memcg: clean up m...
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
  		return NULL;
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
  
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	int usage_count;
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
  
  	if (!move_anon() || non_swap_entry(ent))
  		return NULL;
  	usage_count = mem_cgroup_count_swap_user(ent, &page);
  	if (usage_count > 1) { /* we don't move shared anon */
024914477   Daisuke Nishimura   memcg: move charg...
4117
4118
  		if (page)
  			put_page(page);
90254a658   Daisuke Nishimura   memcg: clean up m...
4119
  		return NULL;
024914477   Daisuke Nishimura   memcg: move charg...
4120
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4121
4122
4123
4124
4125
  	if (do_swap_account)
  		entry->val = ent.val;
  
  	return page;
  }
87946a722   Daisuke Nishimura   memcg: move charg...
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
  	struct inode *inode;
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
  	if (!move_file())
  		return NULL;
  
  	inode = vma->vm_file->f_path.dentry->d_inode;
  	mapping = vma->vm_file->f_mapping;
  	if (pte_none(ptent))
  		pgoff = linear_page_index(vma, addr);
  	else /* pte_file(ptent) is true */
  		pgoff = pte_to_pgoff(ptent);
  
  	/* page is moved even if it's not RSS of this task (page-faulted). */
  	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
  		page = find_get_page(mapping, pgoff);
  	} else { /* shmem/tmpfs file. we should take account of swap too. */
  		swp_entry_t ent;
  		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
  		if (do_swap_account)
  			entry->val = ent.val;
  	}
  
  	return page;
  }
90254a658   Daisuke Nishimura   memcg: clean up m...
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
  static int is_target_pte_for_mc(struct vm_area_struct *vma,
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
  	struct page_cgroup *pc;
  	int ret = 0;
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
87946a722   Daisuke Nishimura   memcg: move charg...
4170
4171
  	else if (pte_none(ptent) || pte_file(ptent))
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
4172
4173
4174
  
  	if (!page && !ent.val)
  		return 0;
024914477   Daisuke Nishimura   memcg: move charg...
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
  	if (page) {
  		pc = lookup_page_cgroup(page);
  		/*
  		 * Do only a loose check without the page_cgroup lock;
  		 * mem_cgroup_move_account() checks whether the pc is valid
  		 * under the lock.
  		 */
  		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4190
4191
  	/* There is a swap entry and the page doesn't exist or isn't charged */
  	if (ent.val && !ret &&
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4192
4193
4194
4195
  			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4196
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
  	return ret;
  }
  
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
  
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
  		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4214
4215
  	return 0;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
  	struct vm_area_struct *vma;
  
  	down_read(&mm->mmap_sem);
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		struct mm_walk mem_cgroup_count_precharge_walk = {
  			.pmd_entry = mem_cgroup_count_precharge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
  		walk_page_range(vma->vm_start, vma->vm_end,
  					&mem_cgroup_count_precharge_walk);
  	}
  	up_read(&mm->mmap_sem);
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4240
4241
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4242
  	return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm));
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4243
4244
4245
4246
4247
  }
  
  static void mem_cgroup_clear_mc(void)
  {
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4248
4249
4250
  	if (mc.precharge) {
  		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
  		mc.precharge = 0;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4251
  		memcg_oom_recover(mc.to);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4252
4253
4254
4255
4256
4257
4258
4259
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
  		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
  		mc.moved_charge = 0;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4260
  		memcg_oom_recover(mc.from);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4261
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
  	/* we must fixup refcnts and charges */
  	if (mc.moved_swap) {
  		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
  		/* uncharge swap account from the old cgroup */
  		if (!mem_cgroup_is_root(mc.from))
  			res_counter_uncharge(&mc.from->memsw,
  						PAGE_SIZE * mc.moved_swap);
  		__mem_cgroup_put(mc.from, mc.moved_swap);
  
  		if (!mem_cgroup_is_root(mc.to)) {
  			/*
  			 * we charged both to->res and to->memsw, so we should
  			 * uncharge to->res.
  			 */
  			res_counter_uncharge(&mc.to->res,
  						PAGE_SIZE * mc.moved_swap);
  			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
  			__css_put(&mc.to->css, mc.moved_swap);
  		}
  		/* we've already done mem_cgroup_get(mc.to) */
  
  		mc.moved_swap = 0;
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4285
4286
  	mc.from = NULL;
  	mc.to = NULL;
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
4287
4288
  	mc.moving_task = NULL;
  	wake_up_all(&mc.waitq);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4289
  }
7dc74be03   Daisuke Nishimura   memcg: add interf...
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
  static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
  				struct cgroup *cgroup,
  				struct task_struct *p,
  				bool threadgroup)
  {
  	int ret = 0;
  	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
  
  	if (mem->move_charge_at_immigrate) {
  		struct mm_struct *mm;
  		struct mem_cgroup *from = mem_cgroup_from_task(p);
  
  		VM_BUG_ON(from == mem);
  
  		mm = get_task_mm(p);
  		if (!mm)
  			return 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4307
  		/* We move charges only when we move the owner of the mm */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4308
4309
4310
4311
  		if (mm->owner == p) {
  			VM_BUG_ON(mc.from);
  			VM_BUG_ON(mc.to);
  			VM_BUG_ON(mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4312
  			VM_BUG_ON(mc.moved_charge);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4313
  			VM_BUG_ON(mc.moved_swap);
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
4314
  			VM_BUG_ON(mc.moving_task);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4315
4316
4317
  			mc.from = from;
  			mc.to = mem;
  			mc.precharge = 0;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4318
  			mc.moved_charge = 0;
483c30b51   Daisuke Nishimura   memcg: improve pe...
4319
  			mc.moved_swap = 0;
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
4320
  			mc.moving_task = current;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4321
4322
4323
4324
4325
  
  			ret = mem_cgroup_precharge_mc(mm);
  			if (ret)
  				mem_cgroup_clear_mc();
  		}
7dc74be03   Daisuke Nishimura   memcg: add interf...
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
  		mmput(mm);
  	}
  	return ret;
  }
  
  static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
  				struct cgroup *cgroup,
  				struct task_struct *p,
  				bool threadgroup)
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4336
  	mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4337
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4338
4339
4340
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4341
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
  	int ret = 0;
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
  
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
  		union mc_target target;
  		int type;
  		struct page *page;
  		struct page_cgroup *pc;
024914477   Daisuke Nishimura   memcg: move charg...
4355
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
  
  		if (!mc.precharge)
  			break;
  
  		type = is_target_pte_for_mc(vma, addr, ptent, &target);
  		switch (type) {
  		case MC_TARGET_PAGE:
  			page = target.page;
  			if (isolate_lru_page(page))
  				goto put;
  			pc = lookup_page_cgroup(page);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4367
4368
  			if (!mem_cgroup_move_account(pc,
  						mc.from, mc.to, false)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4369
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4370
4371
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4372
4373
4374
4375
4376
  			}
  			putback_lru_page(page);
  put:			/* is_target_pte_for_mc() gets the page */
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
4377
4378
  		case MC_TARGET_SWAP:
  			ent = target.ent;
483c30b51   Daisuke Nishimura   memcg: improve pe...
4379
4380
  			if (!mem_cgroup_move_swap_account(ent,
  						mc.from, mc.to, false)) {
024914477   Daisuke Nishimura   memcg: move charg...
4381
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
4382
4383
4384
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
4385
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all precharges we got in can_attach().
  		 * We try charging one by one, but we don't do any additional
  		 * charges to mc.to if we have already failed to charge once
  		 * during the attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4400
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
  
  static void mem_cgroup_move_charge(struct mm_struct *mm)
  {
  	struct vm_area_struct *vma;
  
  	lru_add_drain_all();
  	down_read(&mm->mmap_sem);
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		int ret;
  		struct mm_walk mem_cgroup_move_charge_walk = {
  			.pmd_entry = mem_cgroup_move_charge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
  		ret = walk_page_range(vma->vm_start, vma->vm_end,
  						&mem_cgroup_move_charge_walk);
  		if (ret)
  			/*
  			 * This means we have consumed all precharges and failed
  			 * to do an additional charge. Just abandon here.
  			 */
  			break;
  	}
  	up_read(&mm->mmap_sem);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4433
  }
67e465a77   Balbir Singh   Memory controller...
4434
4435
4436
  static void mem_cgroup_move_task(struct cgroup_subsys *ss,
  				struct cgroup *cont,
  				struct cgroup *old_cont,
be367d099   Ben Blum   cgroups: let ss->...
4437
4438
  				struct task_struct *p,
  				bool threadgroup)
67e465a77   Balbir Singh   Memory controller...
4439
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
  	struct mm_struct *mm;
  
  	if (!mc.to)
  		/* no need to move charge */
  		return;
  
  	mm = get_task_mm(p);
  	if (mm) {
  		mem_cgroup_move_charge(mm);
  		mmput(mm);
  	}
  	mem_cgroup_clear_mc();
67e465a77   Balbir Singh   Memory controller...
4452
  }
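/*
 * Hedged userspace sketch (cgroup v1): driving the move-charge path
 * above.  Bit 0 of memory.move_charge_at_immigrate selects anonymous
 * pages (move_anon()), bit 1 selects file pages (move_file()); writing
 * a task's pid to "tasks" then goes through can_attach() -> precharge ->
 * attach() -> mem_cgroup_move_charge().  Paths and the pid argument are
 * illustrative assumptions.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");
	int ok;

	if (!f)
		return -1;
	ok = fputs(val, f) != EOF;
	if (fclose(f) == EOF)
		ok = 0;
	return ok ? 0 : -1;
}

int main(int argc, char **argv)
{
	const char *dst = "/sys/fs/cgroup/memory/dest";
	char path[256];

	if (argc < 2)	/* usage: ./move <pid> */
		return 1;

	/* move both anon and file charges along with the task */
	snprintf(path, sizeof(path), "%s/memory.move_charge_at_immigrate", dst);
	if (write_str(path, "3"))
		return 1;

	/* migrating the task is what actually moves its charges */
	snprintf(path, sizeof(path), "%s/tasks", dst);
	return write_str(path, argv[1]) ? 1 : 0;
}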
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
  #else	/* !CONFIG_MMU */
  static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
  				struct cgroup *cgroup,
  				struct task_struct *p,
  				bool threadgroup)
  {
  	return 0;
  }
  static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
  				struct cgroup *cgroup,
  				struct task_struct *p,
  				bool threadgroup)
  {
  }
  static void mem_cgroup_move_task(struct cgroup_subsys *ss,
  				struct cgroup *cont,
  				struct cgroup *old_cont,
  				struct task_struct *p,
  				bool threadgroup)
  {
  }
  #endif
67e465a77   Balbir Singh   Memory controller...
4475

8cdea7c05   Balbir Singh   Memory controller...
4476
4477
4478
4479
  struct cgroup_subsys mem_cgroup_subsys = {
  	.name = "memory",
  	.subsys_id = mem_cgroup_subsys_id,
  	.create = mem_cgroup_create,
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4480
  	.pre_destroy = mem_cgroup_pre_destroy,
8cdea7c05   Balbir Singh   Memory controller...
4481
4482
  	.destroy = mem_cgroup_destroy,
  	.populate = mem_cgroup_populate,
7dc74be03   Daisuke Nishimura   memcg: add interf...
4483
4484
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
67e465a77   Balbir Singh   Memory controller...
4485
  	.attach = mem_cgroup_move_task,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4486
  	.early_init = 0,
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
4487
  	.use_id = 1,
8cdea7c05   Balbir Singh   Memory controller...
4488
  };
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
  
  #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
  
  static int __init disable_swap_account(char *s)
  {
  	really_do_swap_account = 0;
  	return 1;
  }
  __setup("noswapaccount", disable_swap_account);
  #endif