  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
   * Memory thresholds
   * Copyright (C) 2009 Nokia Corporation
   * Author: Kirill A. Shutemov
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
  
  #include <linux/res_counter.h>
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
  #include <linux/mm.h>
  #include <linux/hugetlb.h>
  #include <linux/pagemap.h>
  #include <linux/smp.h>
  #include <linux/page-flags.h>
  #include <linux/backing-dev.h>
  #include <linux/bit_spinlock.h>
  #include <linux/rcupdate.h>
  #include <linux/limits.h>
  #include <linux/export.h>
  #include <linux/mutex.h>
  #include <linux/rbtree.h>
  #include <linux/slab.h>
  #include <linux/swap.h>
  #include <linux/swapops.h>
  #include <linux/spinlock.h>
  #include <linux/eventfd.h>
  #include <linux/sort.h>
  #include <linux/fs.h>
  #include <linux/seq_file.h>
  #include <linux/vmalloc.h>
  #include <linux/mm_inline.h>
  #include <linux/page_cgroup.h>
  #include <linux/cpu.h>
  #include <linux/oom.h>
  #include "internal.h"
  #include <net/sock.h>
  #include <net/tcp_memcontrol.h>

  #include <asm/uaccess.h>
  #include <trace/events/vmscan.h>
  struct cgroup_subsys mem_cgroup_subsys __read_mostly;
  #define MEM_CGROUP_RECLAIM_RETRIES	5
  static struct mem_cgroup *root_mem_cgroup __read_mostly;

  #ifdef CONFIG_MEMCG_SWAP
  /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
  int do_swap_account __read_mostly;
  
  /* For remembering the boot option */
  #ifdef CONFIG_MEMCG_SWAP_ENABLED
  static int really_do_swap_account __initdata = 1;
  #else
  static int really_do_swap_account __initdata = 0;
  #endif
  #else
  #define do_swap_account		0
  #endif
  /*
   * Statistics for memory cgroup.
   */
  enum mem_cgroup_stat_index {
  	/*
  	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
  	 */
  	MEM_CGROUP_STAT_CACHE, 	   /* # of pages charged as cache */
  	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
  	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
  	MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
  	MEM_CGROUP_STAT_NSTATS,
  };
  static const char * const mem_cgroup_stat_names[] = {
  	"cache",
  	"rss",
  	"mapped_file",
  	"swap",
  };
  enum mem_cgroup_events_index {
  	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
  	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
  	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
  	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
  	MEM_CGROUP_EVENTS_NSTATS,
  };
  
  static const char * const mem_cgroup_events_names[] = {
  	"pgpgin",
  	"pgpgout",
  	"pgfault",
  	"pgmajfault",
  };
  /*
   * Per memcg event counter is incremented at every pagein/pageout. With THP,
   * it will be incremented by the number of pages. This counter is used
   * to trigger some periodic events. This is straightforward and better
   * than using jiffies etc. to handle periodic memcg events.
   */
  enum mem_cgroup_events_target {
  	MEM_CGROUP_TARGET_THRESH,
  	MEM_CGROUP_TARGET_SOFTLIMIT,
  	MEM_CGROUP_TARGET_NUMAINFO,
  	MEM_CGROUP_NTARGETS,
  };
  #define THRESHOLDS_EVENTS_TARGET 128
  #define SOFTLIMIT_EVENTS_TARGET 1024
  #define NUMAINFO_EVENTS_TARGET	1024
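  /*
   * The targets above are measured in page events: mem_cgroup_charge_statistics()
   * accumulates nr_page_events per cpu, and mem_cgroup_event_ratelimit() below
   * fires a target once that per-cpu count advances past the armed value (e.g.
   * thresholds are re-checked roughly every 128 page events per cpu).
   */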

  struct mem_cgroup_stat_cpu {
  	long count[MEM_CGROUP_STAT_NSTATS];
  	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
  	unsigned long nr_page_events;
  	unsigned long targets[MEM_CGROUP_NTARGETS];
  };
  struct mem_cgroup_reclaim_iter {
  	/* css_id of the last scanned hierarchy member */
  	int position;
  	/* scan generation, increased every round-trip */
  	unsigned int generation;
  };
  /*
   * per-zone information in memory controller.
   */
  struct mem_cgroup_per_zone {
  	struct lruvec		lruvec;
  	unsigned long		lru_size[NR_LRU_LISTS];

  	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
  	struct rb_node		tree_node;	/* RB tree node */
  	unsigned long long	usage_in_excess;/* Set to the value by which */
  						/* the soft limit is exceeded*/
  	bool			on_tree;
  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
  						/* use container_of	   */
  };
  
  struct mem_cgroup_per_node {
  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_lru_info {
  	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
  };
  
  /*
   * Cgroups above their limits are maintained in a RB-Tree, independent of
   * their hierarchy representation
   */
  
  struct mem_cgroup_tree_per_zone {
  	struct rb_root rb_root;
  	spinlock_t lock;
  };
  
  struct mem_cgroup_tree_per_node {
  	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_tree {
  	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  };
  
  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
  struct mem_cgroup_threshold {
  	struct eventfd_ctx *eventfd;
  	u64 threshold;
  };
  /* For threshold */
  struct mem_cgroup_threshold_ary {
  	/* An array index points to threshold just below or equal to usage. */
  	int current_threshold;
  	/* Size of entries[] */
  	unsigned int size;
  	/* Array of thresholds */
  	struct mem_cgroup_threshold entries[0];
  };
  
  struct mem_cgroup_thresholds {
  	/* Primary thresholds array */
  	struct mem_cgroup_threshold_ary *primary;
  	/*
  	 * Spare threshold array.
  	 * This is needed to make mem_cgroup_unregister_event() "never fail".
  	 * It must be able to store at least primary->size - 1 entries.
  	 */
  	struct mem_cgroup_threshold_ary *spare;
  };
  /* for OOM */
  struct mem_cgroup_eventfd_list {
  	struct list_head list;
  	struct eventfd_ctx *eventfd;
  };

  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

  /*
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
   *
   * TODO: Add a water mark for the memory controller. Reclaim will begin when
   * we hit the water mark. Maybe even add a low water mark, such that
   * no reclaim occurs from a cgroup at its low water mark; this is
   * a feature that will be implemented much later in the future.
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
  	/*
  	 * the counter to account for memory usage
  	 */
  	struct res_counter res;
  
  	union {
  		/*
  		 * the counter to account for mem+swap usage.
  		 */
  		struct res_counter memsw;
  
  		/*
  		 * rcu_freeing is used only when freeing struct mem_cgroup,
  		 * so put it into a union to avoid wasting more memory.
  		 * It must be disjoint from the css field.  It could be
  		 * in a union with the res field, but res plays a much
  		 * larger part in mem_cgroup life than memsw, and might
  		 * be of interest, even at time of free, when debugging.
  		 * So share rcu_head with the less interesting memsw.
  		 */
  		struct rcu_head rcu_freeing;
  		/*
  		 * We also need some space for a worker in deferred freeing.
  		 * By the time we call it, rcu_freeing is no longer in use.
  		 */
  		struct work_struct work_freeing;
  	};
  	/*
  	 * Per cgroup active and inactive list, similar to the
  	 * per zone LRU lists.
  	 */
  	struct mem_cgroup_lru_info info;
  	int last_scanned_node;
  #if MAX_NUMNODES > 1
  	nodemask_t	scan_nodes;
  	atomic_t	numainfo_events;
  	atomic_t	numainfo_updating;
  #endif
  	/*
  	 * Should the accounting and control be hierarchical, per subtree?
  	 */
  	bool use_hierarchy;
  
  	bool		oom_lock;
  	atomic_t	under_oom;
  	atomic_t	refcnt;

  	int	swappiness;
  	/* OOM-Killer disable */
  	int		oom_kill_disable;

  	/* set when res.limit == memsw.limit */
  	bool		memsw_is_minimum;
  	/* protect arrays of thresholds */
  	struct mutex thresholds_lock;
  
  	/* thresholds for memory usage. RCU-protected */
  	struct mem_cgroup_thresholds thresholds;

  	/* thresholds for mem+swap usage. RCU-protected */
  	struct mem_cgroup_thresholds memsw_thresholds;

  	/* For oom notifier event fd */
  	struct list_head oom_notify;

  	/*
  	 * Should we move charges of a task when a task is moved into this
  	 * mem_cgroup ? And what type of charges should we move ?
  	 */
  	unsigned long 	move_charge_at_immigrate;
  	/*
  	 * set > 0 if pages under this cgroup are moving to another cgroup.
  	 */
  	atomic_t	moving_account;
  	/* taken only while moving_account > 0 */
  	spinlock_t	move_lock;
  	/*
  	 * percpu counter.
  	 */
  	struct mem_cgroup_stat_cpu __percpu *stat;
  	/*
  	 * used when a cpu is offlined or other synchronizations
  	 * See mem_cgroup_read_stat().
  	 */
  	struct mem_cgroup_stat_cpu nocpu_base;
  	spinlock_t pcp_counter_lock;
  
  #ifdef CONFIG_INET
  	struct tcp_memcontrol tcp_mem;
  #endif
  };
  /* Stuff for moving charges at task migration. */
  /*
   * Types of charges to be moved. "move_charge_at_immigrate" is treated as a
   * left-shifted bitmap of these types.
   */
  enum move_type {
  	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
  	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
  	NR_MOVE_TYPE,
  };
  /* "mc" and its members are protected by cgroup_mutex */
  static struct move_charge_struct {
  	spinlock_t	  lock; /* for from, to */
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
  	unsigned long precharge;
  	unsigned long moved_charge;
  	unsigned long moved_swap;
  	struct task_struct *moving_task;	/* a task moving charges */
  	wait_queue_head_t waitq;		/* a waitq for other context */
  } mc = {
  	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  };

  static bool move_anon(void)
  {
  	return test_bit(MOVE_CHARGE_TYPE_ANON,
  					&mc.to->move_charge_at_immigrate);
  }
  static bool move_file(void)
  {
  	return test_bit(MOVE_CHARGE_TYPE_FILE,
  					&mc.to->move_charge_at_immigrate);
  }
  /*
   * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
   * limit reclaim to prevent infinite loops, if they ever occur.
   */
  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
  #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
  	MEM_CGROUP_CHARGE_TYPE_ANON,
  	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
  	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
  	NR_CHARGE_TYPE,
  };
  /* for encoding cft->private value on file */
  #define _MEM			(0)
  #define _MEMSWAP		(1)
  #define _OOM_TYPE		(2)
  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
  #define MEMFILE_ATTR(val)	((val) & 0xffff)
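  /*
   * Example (illustrative): a cft->private built as
   * MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT) carries the counter type in the
   * upper 16 bits and the res_counter member (a RES_* value from
   * res_counter.h) in the lower 16 bits; MEMFILE_TYPE() and MEMFILE_ATTR()
   * recover the two halves when the file is read or written.
   */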
  /* Used for OOM notifier */
  #define OOM_CONTROL		(0)

  /*
   * Reclaim flags for mem_cgroup_hierarchical_reclaim
   */
  #define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
  #define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
  #define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
  #define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
  static void mem_cgroup_get(struct mem_cgroup *memcg);
  static void mem_cgroup_put(struct mem_cgroup *memcg);

  static inline
  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
  {
  	return container_of(s, struct mem_cgroup, css);
  }
  /* Writing them here to avoid exposing memcg's inner layout */
  #ifdef CONFIG_MEMCG_KMEM
  #include <net/sock.h>
  #include <net/ip.h>
  
  static bool mem_cgroup_is_root(struct mem_cgroup *memcg);
  void sock_update_memcg(struct sock *sk)
  {
  	if (mem_cgroup_sockets_enabled) {
  		struct mem_cgroup *memcg;
  		struct cg_proto *cg_proto;
  
  		BUG_ON(!sk->sk_prot->proto_cgroup);
  		/* Socket cloning can throw us here with sk_cgrp already
  		 * filled. It won't however, necessarily happen from
  		 * process context. So the test for root memcg given
  		 * the current task's memcg won't help us in this case.
  		 *
  		 * Respecting the original socket's memcg is a better
  		 * decision in this case.
  		 */
  		if (sk->sk_cgrp) {
  			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
  			mem_cgroup_get(sk->sk_cgrp->memcg);
  			return;
  		}
  		rcu_read_lock();
  		memcg = mem_cgroup_from_task(current);
  		cg_proto = sk->sk_prot->proto_cgroup(memcg);
  		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
  			mem_cgroup_get(memcg);
  			sk->sk_cgrp = cg_proto;
  		}
  		rcu_read_unlock();
  	}
  }
  EXPORT_SYMBOL(sock_update_memcg);
  
  void sock_release_memcg(struct sock *sk)
  {
  	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
  		struct mem_cgroup *memcg;
  		WARN_ON(!sk->sk_cgrp->memcg);
  		memcg = sk->sk_cgrp->memcg;
  		mem_cgroup_put(memcg);
  	}
  }

  #ifdef CONFIG_INET
  struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
  {
  	if (!memcg || mem_cgroup_is_root(memcg))
  		return NULL;
  
  	return &memcg->tcp_mem.cg_proto;
  }
  EXPORT_SYMBOL(tcp_proto_cgroup);
  #endif /* CONFIG_INET */
  #endif /* CONFIG_MEMCG_KMEM */

  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
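  /*
   * If tcp accounting was ever activated for this memcg, drop the
   * corresponding reference on the socket-limit static key; otherwise there
   * is nothing to undo.
   */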
  static void disarm_sock_keys(struct mem_cgroup *memcg)
  {
  	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
  		return;
  	static_key_slow_dec(&memcg_socket_limit_enabled);
  }
  #else
  static void disarm_sock_keys(struct mem_cgroup *memcg)
  {
  }
  #endif
  static void drain_all_stock_async(struct mem_cgroup *memcg);

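  /* Look up the per-zone state of @memcg for the given node/zone pair. */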
  static struct mem_cgroup_per_zone *
  mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
  {
  	return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
  }
  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
  {
  	return &memcg->css;
  }
  static struct mem_cgroup_per_zone *
  page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);

  	return mem_cgroup_zoneinfo(memcg, nid, zid);
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
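  /*
   * Link @mz into @mctz's RB-tree, ordered by how far the memcg exceeds its
   * soft limit; entries with a larger excess end up further to the right.
   */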
  static void
  __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz,
  				unsigned long long new_usage_in_excess)
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
  
  static void
  __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz)
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
  
  static void
  mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
  				struct mem_cgroup_per_zone *mz,
  				struct mem_cgroup_tree_per_zone *mctz)
  {
  	spin_lock(&mctz->lock);
  	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
  	spin_unlock(&mctz->lock);
  }

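  /*
   * Called when the softlimit event target fires: refresh the position of
   * @memcg and each of its ancestors on the per-zone soft limit tree for the
   * zone that @page belongs to.
   */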
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
  	unsigned long long excess;
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  	mctz = soft_limit_tree_from_page(page);
  
  	/*
  	 * Necessary to update all ancestors when hierarchy is used,
  	 * because their event counter is not touched.
  	 */
  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
  		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
  		excess = res_counter_soft_limit_excess(&memcg->res);
  		/*
  		 * We have to update the tree if mz is on RB-tree or
  		 * mem is over its softlimit.
  		 */
  		if (excess || mz->on_tree) {
  			spin_lock(&mctz->lock);
  			/* if on-tree, remove it */
  			if (mz->on_tree)
  				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
  			/*
  			 * Insert again. mz->usage_in_excess will be updated.
  			 * If excess is 0, no tree ops.
  			 */
  			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
  			spin_unlock(&mctz->lock);
  		}
  	}
  }
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
  {
  	int node, zone;
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
  	for_each_node(node) {
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			mz = mem_cgroup_zoneinfo(memcg, node, zone);
  			mctz = soft_limit_tree_node_zone(node, zone);
  			mem_cgroup_remove_exceeded(memcg, mz, mctz);
  		}
  	}
  }
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back,
  	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
  	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
  	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
  		!css_tryget(&mz->memcg->css))
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
  
  	spin_lock(&mctz->lock);
  	mz = __mem_cgroup_largest_soft_limit_node(mctz);
  	spin_unlock(&mctz->lock);
  	return mz;
  }
  /*
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both vmstat[] and percpu_counter have thresholds and do periodic
   * synchronization to implement "quick" reads. There is a trade-off between
   * reading cost and precision of the value. Later, we may get a chance to
   * implement a periodic synchronization of the counter in memcg's counter.
   *
   * But this _read() function is used for the user interface now. The user
   * accounts memory usage by memory cgroup and _always_ requires an exact
   * value because he accounts memory. Even if we provide a quick-and-fuzzy
   * read, we always have to visit all online cpus and make the sum. So, for
   * now, the unnecessary synchronization is not implemented (it is just
   * implemented for cpu hotplug).
   *
   * If there are kernel-internal users which can make use of a not-exact
   * value, and reading all cpu values can be a performance bottleneck in some
   * common workload, a threshold and synchronization as in vmstat[] should be
   * implemented.
   */
  static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
  				 enum mem_cgroup_stat_index idx)
  {
  	long val = 0;
  	int cpu;

  	get_online_cpus();
  	for_each_online_cpu(cpu)
  		val += per_cpu(memcg->stat->count[idx], cpu);
  #ifdef CONFIG_HOTPLUG_CPU
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.count[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
  #endif
  	put_online_cpus();
  	return val;
  }
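  /* Bump or drop the per-cpu MEM_CGROUP_STAT_SWAP counter by one page. */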
  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
  					 bool charge)
  {
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
  }
  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
  					    enum mem_cgroup_events_index idx)
  {
  	unsigned long val = 0;
  	int cpu;
  
  	for_each_online_cpu(cpu)
  		val += per_cpu(memcg->stat->events[idx], cpu);
  #ifdef CONFIG_HOTPLUG_CPU
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.events[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
  #endif
  	return val;
  }
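  /*
   * Per-cpu bookkeeping for (un)charging @nr_pages: adjust the RSS or CACHE
   * counter, count a PGPGIN or PGPGOUT event, and feed nr_page_events so the
   * periodic event targets above can fire.
   */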
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
  					 bool anon, int nr_pages)
  {
  	preempt_disable();
  	/*
  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
  	 * counted as CACHE even if it's on ANON LRU.
  	 */
  	if (anon)
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
  				nr_pages);
  	else
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
  				nr_pages);

  	/* pagein of a big page is an event. So, ignore page size */
  	if (nr_pages > 0)
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
  	else {
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
  		nr_pages = -nr_pages; /* for event */
  	}

  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

  	preempt_enable();
  }
  unsigned long
  mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
  {
  	struct mem_cgroup_per_zone *mz;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	return mz->lru_size[lru];
  }
  
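  /*
   * Sum the sizes of all LRU lists selected by @lru_mask (a bitmask of
   * BIT(lru) values) in one zone of @memcg.
   */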
  static unsigned long
  mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
  			unsigned int lru_mask)
  {
  	struct mem_cgroup_per_zone *mz;
  	enum lru_list lru;
  	unsigned long ret = 0;
  	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

  	for_each_lru(lru) {
  		if (BIT(lru) & lru_mask)
  			ret += mz->lru_size[lru];
  	}
  	return ret;
  }
  
  static unsigned long
  mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
  			int nid, unsigned int lru_mask)
  {
  	u64 total = 0;
  	int zid;
  	for (zid = 0; zid < MAX_NR_ZONES; zid++)
  		total += mem_cgroup_zone_nr_lru_pages(memcg,
  						nid, zid, lru_mask);

  	return total;
  }

  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
  			unsigned int lru_mask)
  {
  	int nid;
  	u64 total = 0;
  	for_each_node_state(nid, N_HIGH_MEMORY)
  		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
  	return total;
  }
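  /*
   * Returns true when enough page events have elapsed since @target last
   * fired, and re-arms the per-cpu target for the next interval.
   */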
  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  				       enum mem_cgroup_events_target target)
  {
  	unsigned long val, next;
  	val = __this_cpu_read(memcg->stat->nr_page_events);
  	next = __this_cpu_read(memcg->stat->targets[target]);
  	/* from time_after() in jiffies.h */
  	if ((long)next - (long)val < 0) {
  		switch (target) {
  		case MEM_CGROUP_TARGET_THRESH:
  			next = val + THRESHOLDS_EVENTS_TARGET;
  			break;
  		case MEM_CGROUP_TARGET_SOFTLIMIT:
  			next = val + SOFTLIMIT_EVENTS_TARGET;
  			break;
  		case MEM_CGROUP_TARGET_NUMAINFO:
  			next = val + NUMAINFO_EVENTS_TARGET;
  			break;
  		default:
  			break;
  		}
  		__this_cpu_write(memcg->stat->targets[target], next);
  		return true;
  	}
  	return false;
  }
  
  /*
   * Check events in order.
   *
   */
  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
  {
  	preempt_disable();
  	/* threshold event is triggered in finer grain than soft limit */
  	if (unlikely(mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_THRESH))) {
  		bool do_softlimit;
  		bool do_numainfo __maybe_unused;
  
  		do_softlimit = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_SOFTLIMIT);
  #if MAX_NUMNODES > 1
  		do_numainfo = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_NUMAINFO);
  #endif
  		preempt_enable();
  		mem_cgroup_threshold(memcg);
  		if (unlikely(do_softlimit))
  			mem_cgroup_update_tree(memcg, page);
  #if MAX_NUMNODES > 1
  		if (unlikely(do_numainfo))
  			atomic_inc(&memcg->numainfo_events);
  #endif
  	} else
  		preempt_enable();
  }
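  /* Return the mem_cgroup bound to the memory subsystem state of @cont. */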
  struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  {
  	return mem_cgroup_from_css(
  		cgroup_subsys_state(cont, mem_cgroup_subsys_id));
  }
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  {
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
  	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
  }
  struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
  {
  	struct mem_cgroup *memcg = NULL;
  
  	if (!mm)
  		return NULL;
  	/*
  	 * Because we have no locks, mm->owner may be being moved to another
  	 * cgroup. We use css_tryget() here even if this looks
  	 * pessimistic (rather than adding locks here).
  	 */
  	rcu_read_lock();
  	do {
  		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  		if (unlikely(!memcg))
  			break;
  	} while (!css_tryget(&memcg->css));
  	rcu_read_unlock();
  	return memcg;
  }
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
   * @prev: previously returned memcg, NULL on first invocation
   * @reclaim: cookie for shared reclaim walks, NULL for full walks
   *
   * Returns references to children of the hierarchy below @root, or
   * @root itself, or %NULL after a full round-trip.
   *
   * Caller must pass the return value in @prev on subsequent
   * invocations for reference counting, or use mem_cgroup_iter_break()
   * to cancel a hierarchy walk before the round-trip is complete.
   *
   * Reclaimers can specify a zone and a priority level in @reclaim to
   * divide up the memcgs in the hierarchy among all concurrent
   * reclaimers operating on the same zone and priority.
   */
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
  				   struct mem_cgroup *prev,
  				   struct mem_cgroup_reclaim_cookie *reclaim)
  {
  	struct mem_cgroup *memcg = NULL;
  	int id = 0;

  	if (mem_cgroup_disabled())
  		return NULL;
  	if (!root)
  		root = root_mem_cgroup;

  	if (prev && !reclaim)
  		id = css_id(&prev->css);

  	if (prev && prev != root)
  		css_put(&prev->css);

  	if (!root->use_hierarchy && root != root_mem_cgroup) {
  		if (prev)
  			return NULL;
  		return root;
  	}

  	while (!memcg) {
  		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
  		struct cgroup_subsys_state *css;

  		if (reclaim) {
  			int nid = zone_to_nid(reclaim->zone);
  			int zid = zone_idx(reclaim->zone);
  			struct mem_cgroup_per_zone *mz;
  
  			mz = mem_cgroup_zoneinfo(root, nid, zid);
  			iter = &mz->reclaim_iter[reclaim->priority];
  			if (prev && reclaim->generation != iter->generation)
  				return NULL;
  			id = iter->position;
  		}

  		rcu_read_lock();
  		css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
  		if (css) {
  			if (css == &root->css || css_tryget(css))
  				memcg = mem_cgroup_from_css(css);
  		} else
  			id = 0;
  		rcu_read_unlock();

  		if (reclaim) {
  			iter->position = id;
  			if (!css)
  				iter->generation++;
  			else if (!prev && memcg)
  				reclaim->generation = iter->generation;
  		}
  
  		if (prev && !css)
  			return NULL;
  	}
  	return memcg;
  }
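  /*
   * Illustrative calling pattern for a full hierarchy walk (the
   * for_each_mem_cgroup_tree() helper below wraps exactly this):
   *
   *	memcg = mem_cgroup_iter(root, NULL, NULL);
   *	while (memcg) {
   *		...
   *		memcg = mem_cgroup_iter(root, memcg, NULL);
   *	}
   */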

  /**
   * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   * @root: hierarchy root
   * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   */
  void mem_cgroup_iter_break(struct mem_cgroup *root,
  			   struct mem_cgroup *prev)
  {
  	if (!root)
  		root = root_mem_cgroup;
  	if (prev && prev != root)
  		css_put(&prev->css);
  }

  /*
   * Iteration constructs for visiting all cgroups (under a tree).  If
   * loops are exited prematurely (break), mem_cgroup_iter_break() must
   * be used for reference counting.
   */
  #define for_each_mem_cgroup_tree(iter, root)		\
  	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
  	     iter != NULL;				\
  	     iter = mem_cgroup_iter(root, iter, NULL))

  #define for_each_mem_cgroup(iter)			\
  	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
  	     iter != NULL;				\
  	     iter = mem_cgroup_iter(NULL, iter, NULL))

  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
  	return (memcg == root_mem_cgroup);
  }
  void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
  {
  	struct mem_cgroup *memcg;
  
  	if (!mm)
  		return;
  
  	rcu_read_lock();
  	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  	if (unlikely(!memcg))
  		goto out;
  
  	switch (idx) {
  	case PGFAULT:
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
  		break;
  	case PGMAJFAULT:
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
  		break;
  	default:
  		BUG();
  	}
  out:
  	rcu_read_unlock();
  }
  EXPORT_SYMBOL(mem_cgroup_count_vm_event);
  /**
   * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
   * @zone: zone of the wanted lruvec
   * @memcg: memcg of the wanted lruvec
   *
   * Returns the lru list vector holding pages for the given @zone and
   * @memcg. This can be the global zone lruvec, if the memory controller
   * is disabled.
   */
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
  
  	if (mem_cgroup_disabled())
  		return &zone->lruvec;
  
  	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
  	return &mz->lruvec;
  }
  /*
   * The following LRU functions are allowed to be used without PCG_LOCK.
   * Operations are called by routines of the global LRU independently of memcg.
   * What we have to take care of here is the validity of pc->mem_cgroup.
   *
   * Changes to pc->mem_cgroup happen when
   * 1. charge
   * 2. moving account
   * In the typical case, "charge" is done before add-to-lru. The exception is
   * SwapCache, which is added to the LRU before charge.
   * If the PCG_USED bit is not set, the page_cgroup is not added to this
   * private LRU. When moving account, the page is not on the LRU; it is isolated.
   */

925b7673c   Johannes Weiner   mm: make per-memc...
1031
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1032
   * mem_cgroup_page_lruvec - return lruvec for adding an lru page
925b7673c   Johannes Weiner   mm: make per-memc...
1033
   * @page: the page
fa9add641   Hugh Dickins   mm/memcg: apply a...
1034
   * @zone: zone of the page
925b7673c   Johannes Weiner   mm: make per-memc...
1035
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1036
  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1037
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1038
  	struct mem_cgroup_per_zone *mz;
925b7673c   Johannes Weiner   mm: make per-memc...
1039
1040
  	struct mem_cgroup *memcg;
  	struct page_cgroup *pc;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
1041

f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
1042
  	if (mem_cgroup_disabled())
925b7673c   Johannes Weiner   mm: make per-memc...
1043
  		return &zone->lruvec;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1044
  	pc = lookup_page_cgroup(page);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
1045
  	memcg = pc->mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1046
1047
  
  	/*
fa9add641   Hugh Dickins   mm/memcg: apply a...
1048
  	 * Surreptitiously switch any uncharged offlist page to root:
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1049
1050
1051
1052
1053
1054
1055
  	 * an uncharged page off lru does nothing to secure
  	 * its former mem_cgroup from sudden removal.
  	 *
  	 * Our caller holds lru_lock, and PageCgroupUsed is updated
  	 * under page_cgroup lock: between them, they make all uses
  	 * of pc->mem_cgroup safe.
  	 */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1056
  	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1057
  		pc->mem_cgroup = memcg = root_mem_cgroup;
925b7673c   Johannes Weiner   mm: make per-memc...
1058
  	mz = page_cgroup_zoneinfo(memcg, page);
925b7673c   Johannes Weiner   mm: make per-memc...
1059
  	return &mz->lruvec;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1060
  }
b69408e88   Christoph Lameter   vmscan: Use an in...
1061

925b7673c   Johannes Weiner   mm: make per-memc...
1062
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1063
1064
1065
1066
   * mem_cgroup_update_lru_size - account for adding or removing an lru page
   * @lruvec: mem_cgroup per zone lru vector
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
925b7673c   Johannes Weiner   mm: make per-memc...
1067
   *
fa9add641   Hugh Dickins   mm/memcg: apply a...
1068
1069
   * This function must be called when a page is added to or removed from an
   * lru list.
3f58a8294   Minchan Kim   memcg: move memcg...
1070
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1071
1072
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  				int nr_pages)
3f58a8294   Minchan Kim   memcg: move memcg...
1073
1074
  {
  	struct mem_cgroup_per_zone *mz;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1075
  	unsigned long *lru_size;
3f58a8294   Minchan Kim   memcg: move memcg...
1076
1077
1078
  
  	if (mem_cgroup_disabled())
  		return;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1079
1080
1081
1082
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	lru_size = mz->lru_size + lru;
  	*lru_size += nr_pages;
  	VM_BUG_ON((long)(*lru_size) < 0);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1083
  }
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
1084

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1085
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1086
   * Checks whether given mem is same or in the root_mem_cgroup's
3e92041d6   Michal Hocko   memcg: add mem_cg...
1087
1088
   * hierarchy subtree
   */
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1089
1090
  bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
  				  struct mem_cgroup *memcg)
3e92041d6   Michal Hocko   memcg: add mem_cg...
1091
  {
91c63734f   Johannes Weiner   kernel: cgroup: p...
1092
1093
  	if (root_memcg == memcg)
  		return true;
3a981f482   Hugh Dickins   memcg: fix use_hi...
1094
  	if (!root_memcg->use_hierarchy || !memcg)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1095
  		return false;
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1096
1097
1098
1099
1100
1101
1102
  	return css_is_ancestor(&memcg->css, &root_memcg->css);
  }
  
  static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
  				       struct mem_cgroup *memcg)
  {
  	bool ret;
91c63734f   Johannes Weiner   kernel: cgroup: p...
1103
  	rcu_read_lock();
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1104
  	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
91c63734f   Johannes Weiner   kernel: cgroup: p...
1105
1106
  	rcu_read_unlock();
  	return ret;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1107
  }
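
  /*
   * Illustrative sketch of a hierarchy check (hypothetical caller). The
   * lockless variant expects rcu_read_lock() to be held; the wrapper above
   * takes it internally:
   *
   *	rcu_read_lock();
   *	if (__mem_cgroup_same_or_subtree(root_memcg, memcg))
   *		;	// memcg is root_memcg itself or one of its descendants
   *	rcu_read_unlock();
   */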
c0ff4b854   Raghavendra K T   memcg: rename mem...
1108
  int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
4c4a22148   David Rientjes   memcontrol: move ...
1109
1110
  {
  	int ret;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1111
  	struct mem_cgroup *curr = NULL;
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1112
  	struct task_struct *p;
4c4a22148   David Rientjes   memcontrol: move ...
1113

158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1114
  	p = find_lock_task_mm(task);
de077d222   David Rientjes   oom, memcg: fix e...
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
  	if (p) {
  		curr = try_get_mem_cgroup_from_mm(p->mm);
  		task_unlock(p);
  	} else {
  		/*
  		 * All threads may have already detached their mm's, but the oom
  		 * killer still needs to detect if they have already been oom
  		 * killed to prevent needlessly killing additional tasks.
  		 */
  		task_lock(task);
  		curr = mem_cgroup_from_task(task);
  		if (curr)
  			css_get(&curr->css);
  		task_unlock(task);
  	}
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1130
1131
  	if (!curr)
  		return 0;
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1132
  	/*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1133
  	 * We should check use_hierarchy of "memcg", not "curr", because checking
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1134
  	 * use_hierarchy of "curr" here would make this function return true if hierarchy is
c0ff4b854   Raghavendra K T   memcg: rename mem...
1135
1136
  	 * enabled in "curr" and "curr" is a child of "memcg" in the *cgroup*
  	 * hierarchy (even if use_hierarchy is disabled in "memcg").
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1137
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1138
  	ret = mem_cgroup_same_or_subtree(memcg, curr);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1139
  	css_put(&curr->css);
4c4a22148   David Rientjes   memcontrol: move ...
1140
1141
  	return ret;
  }
c56d5c7df   Konstantin Khlebnikov   mm/vmscan: push l...
1142
  int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
14797e236   KOSAKI Motohiro   memcg: add inacti...
1143
  {
9b272977e   Johannes Weiner   memcg: skip scann...
1144
  	unsigned long inactive_ratio;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1145
  	unsigned long inactive;
9b272977e   Johannes Weiner   memcg: skip scann...
1146
  	unsigned long active;
c772be939   KOSAKI Motohiro   memcg: fix calcul...
1147
  	unsigned long gb;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1148

4d7dcca21   Hugh Dickins   mm/memcg: get_lru...
1149
1150
  	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
  	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
14797e236   KOSAKI Motohiro   memcg: add inacti...
1151

c772be939   KOSAKI Motohiro   memcg: fix calcul...
1152
1153
1154
1155
1156
  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
  	if (gb)
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
9b272977e   Johannes Weiner   memcg: skip scann...
1157
  	return inactive * inactive_ratio < active;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1158
  }
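
  /*
   * Worked example of the ratio above (illustrative numbers): with 4GB of
   * anon pages in the memcg, gb = 4 and inactive_ratio = int_sqrt(40) = 6,
   * so the inactive list is only considered low while inactive * 6 < active.
   * Below 1GB, the ratio falls back to 1, i.e. low means inactive < active.
   */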
c56d5c7df   Konstantin Khlebnikov   mm/vmscan: push l...
1159
  int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
56e49d218   Rik van Riel   vmscan: evict use...
1160
1161
1162
  {
  	unsigned long active;
  	unsigned long inactive;
4d7dcca21   Hugh Dickins   mm/memcg: get_lru...
1163
1164
  	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
  	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
56e49d218   Rik van Riel   vmscan: evict use...
1165
1166
1167
  
  	return (active > inactive);
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1168
1169
  #define mem_cgroup_from_res_counter(counter, member)	\
  	container_of(counter, struct mem_cgroup, member)
19942822d   Johannes Weiner   memcg: prevent en...
1170
  /**
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1171
   * mem_cgroup_margin - calculate chargeable space of a memory cgroup
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1172
   * @memcg: the memory cgroup
19942822d   Johannes Weiner   memcg: prevent en...
1173
   *
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1174
   * Returns the maximum amount of memory @memcg can be charged with, in
7ec99d621   Johannes Weiner   memcg: unify char...
1175
   * pages.
19942822d   Johannes Weiner   memcg: prevent en...
1176
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1177
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
19942822d   Johannes Weiner   memcg: prevent en...
1178
  {
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1179
  	unsigned long long margin;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1180
  	margin = res_counter_margin(&memcg->res);
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1181
  	if (do_swap_account)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1182
  		margin = min(margin, res_counter_margin(&memcg->memsw));
7ec99d621   Johannes Weiner   memcg: unify char...
1183
  	return margin >> PAGE_SHIFT;
19942822d   Johannes Weiner   memcg: prevent en...
1184
  }
1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
1185
  int mem_cgroup_swappiness(struct mem_cgroup *memcg)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1186
1187
  {
  	struct cgroup *cgrp = memcg->css.cgroup;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1188
1189
1190
1191
  
  	/* root ? */
  	if (cgrp->parent == NULL)
  		return vm_swappiness;
bf1ff2635   Johannes Weiner   memcg: remove mem...
1192
  	return memcg->swappiness;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1193
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
  /*
   * memcg->moving_account is used for checking the possibility that some thread
   * is calling move_account(). When a thread on CPU-A starts moving pages under
   * a memcg, other threads should check memcg->moving_account under
   * rcu_read_lock(), like this:
   *
   *         CPU-A                                    CPU-B
   *                                              rcu_read_lock()
   *         memcg->moving_account+1              if (memcg->moving_account)
   *                                                   take heavy locks.
   *         synchronize_rcu()                    update something.
   *                                              rcu_read_unlock()
   *         start move here.
   */
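
  /*
   * A minimal reader-side sketch of the protocol above (the real reader is
   * __mem_cgroup_begin_update_page_stat() further down in this file; "flags"
   * is a local variable of the hypothetical caller):
   *
   *	rcu_read_lock();
   *	if (mem_cgroup_stolen(memcg))		// a mover may be running
   *		move_lock_mem_cgroup(memcg, &flags);
   *	// ... update pc->mem_cgroup based statistics ...
   */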
4331f7d33   KAMEZAWA Hiroyuki   memcg: fix perfor...
1208
1209
1210
  
  /* for quick checking without looking up memcg */
  atomic_t memcg_moving __read_mostly;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1211
  static void mem_cgroup_start_move(struct mem_cgroup *memcg)
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1212
  {
4331f7d33   KAMEZAWA Hiroyuki   memcg: fix perfor...
1213
  	atomic_inc(&memcg_moving);
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1214
  	atomic_inc(&memcg->moving_account);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1215
1216
  	synchronize_rcu();
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1217
  static void mem_cgroup_end_move(struct mem_cgroup *memcg)
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1218
  {
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1219
1220
1221
1222
  	/*
  	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
  	 * We check NULL in callee rather than caller.
  	 */
4331f7d33   KAMEZAWA Hiroyuki   memcg: fix perfor...
1223
1224
  	if (memcg) {
  		atomic_dec(&memcg_moving);
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1225
  		atomic_dec(&memcg->moving_account);
4331f7d33   KAMEZAWA Hiroyuki   memcg: fix perfor...
1226
  	}
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1227
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1228

32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1229
1230
1231
  /*
   * 2 routines for checking whether "mem" is under move_account() or not.
   *
13fd1dd9d   Andrew Morton   mm/memcontrol.c: ...
1232
1233
   * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This
   *			  is used for avoiding races in accounting.  If true,
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1234
1235
1236
1237
1238
1239
   *			  pc->mem_cgroup may be overwritten.
   *
   * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
   *			  under the hierarchy of moving cgroups. This is for
   *			  waiting at high memory pressure caused by "move".
   */
13fd1dd9d   Andrew Morton   mm/memcontrol.c: ...
1240
  static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1241
1242
  {
  	VM_BUG_ON(!rcu_read_lock_held());
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1243
  	return atomic_read(&memcg->moving_account) > 0;
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1244
  }
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1245

c0ff4b854   Raghavendra K T   memcg: rename mem...
1246
  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1247
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1248
1249
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1250
  	bool ret = false;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1251
1252
1253
1254
1255
1256
1257
1258
1259
  	/*
  	 * Unlike task_move routines, we access mc.to, mc.from not under
  	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
  	 */
  	spin_lock(&mc.lock);
  	from = mc.from;
  	to = mc.to;
  	if (!from)
  		goto unlock;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1260

c0ff4b854   Raghavendra K T   memcg: rename mem...
1261
1262
  	ret = mem_cgroup_same_or_subtree(memcg, from)
  		|| mem_cgroup_same_or_subtree(memcg, to);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1263
1264
  unlock:
  	spin_unlock(&mc.lock);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1265
1266
  	return ret;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1267
  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1268
1269
  {
  	if (mc.moving_task && current != mc.moving_task) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1270
  		if (mem_cgroup_under_move(memcg)) {
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
  			DEFINE_WAIT(wait);
  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  			/* moving charge context might have finished. */
  			if (mc.moving_task)
  				schedule();
  			finish_wait(&mc.waitq, &wait);
  			return true;
  		}
  	}
  	return false;
  }
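
  /*
   * Illustrative sketch (hypothetical, simplified caller): a charge path that
   * has hit its limit can back off while a move is in flight and then retry;
   * "mem_over_limit" names the memcg whose limit was hit:
   *
   *	if (mem_cgroup_wait_acct_move(mem_over_limit))
   *		goto again;	// a move was running; retry the charge
   */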
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
1282
1283
1284
1285
  /*
   * Take this lock when
   * - code tries to modify a page's memcg while it's USED.
   * - code tries to modify page state accounting in a memcg.
13fd1dd9d   Andrew Morton   mm/memcontrol.c: ...
1286
   * see mem_cgroup_stolen(), too.
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
   */
  static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
  				  unsigned long *flags)
  {
  	spin_lock_irqsave(&memcg->move_lock, *flags);
  }
  
  static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  				unsigned long *flags)
  {
  	spin_unlock_irqrestore(&memcg->move_lock, *flags);
  }
e222432bf   Balbir Singh   memcg: show memcg...
1299
  /**
6a6135b64   Kirill A. Shutemov   memcg: typo in co...
1300
   * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
e222432bf   Balbir Singh   memcg: show memcg...
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
  	struct cgroup *task_cgrp;
  	struct cgroup *mem_cgrp;
  	/*
  	 * Need a buffer in BSS, can't rely on allocations. The code relies
  	 * on the assumption that OOM is serialized for memory controller.
  	 * If this assumption is broken, revisit this code.
  	 */
  	static char memcg_name[PATH_MAX];
  	int ret;
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1318
  	if (!memcg || !p)
e222432bf   Balbir Singh   memcg: show memcg...
1319
  		return;
e222432bf   Balbir Singh   memcg: show memcg...
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
  	rcu_read_lock();
  
  	mem_cgrp = memcg->css.cgroup;
  	task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
  
  	ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
  	if (ret < 0) {
  		/*
  		 * Unfortunately, we are unable to convert to a useful name,
  		 * but we'll still print out the usage information.
  		 */
  		rcu_read_unlock();
  		goto done;
  	}
  	rcu_read_unlock();
  
  	printk(KERN_INFO "Task in %s killed", memcg_name);
  
  	rcu_read_lock();
  	ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
  	if (ret < 0) {
  		rcu_read_unlock();
  		goto done;
  	}
  	rcu_read_unlock();
  
  	/*
  	 * Continues from above, so we don't need a KERN_ level
  	 */
  	printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
  done:
  
  	printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
  		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->res, RES_FAILCNT));
  	printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
  		"failcnt %llu\n",
  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
  }
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1365
1366
1367
1368
  /*
   * This function returns the number of memcgs under the hierarchy tree.
   * Returns 1 (self count) if there are no children.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1369
  static int mem_cgroup_count_children(struct mem_cgroup *memcg)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1370
1371
  {
  	int num = 0;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1372
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1373
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1374
  		num++;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1375
1376
  	return num;
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1377
  /*
a63d83f42   David Rientjes   oom: badness heur...
1378
1379
   * Return the memory (and swap, if configured) limit for a memcg.
   */
9cbb78bb3   David Rientjes   mm, memcg: introd...
1380
  static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
a63d83f42   David Rientjes   oom: badness heur...
1381
1382
1383
  {
  	u64 limit;
  	u64 memsw;
f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1384
1385
  	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  	limit += total_swap_pages << PAGE_SHIFT;
a63d83f42   David Rientjes   oom: badness heur...
1386
1387
1388
1389
1390
1391
1392
  	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  	/*
  	 * If memsw is finite and limits the amount of swap space available
  	 * to this memcg, return that limit.
  	 */
  	return min(limit, memsw);
  }
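
  /*
   * Worked example (illustrative numbers): with a 512MB memory limit, 1GB of
   * total swap and an effectively unlimited memsw limit, limit becomes
   * 512MB + 1GB = 1.5GB and min() returns 1.5GB. A tighter memsw limit would
   * be returned instead.
   */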
876aafbfd   David Rientjes   mm, memcg: move a...
1393
1394
  void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
  			      int order)
9cbb78bb3   David Rientjes   mm, memcg: introd...
1395
1396
1397
1398
1399
1400
  {
  	struct mem_cgroup *iter;
  	unsigned long chosen_points = 0;
  	unsigned long totalpages;
  	unsigned int points = 0;
  	struct task_struct *chosen = NULL;
876aafbfd   David Rientjes   mm, memcg: move a...
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
  	/*
  	 * If current has a pending SIGKILL, then automatically select it.  The
  	 * goal is to allow it to allocate so that it may quickly exit and free
  	 * its memory.
  	 */
  	if (fatal_signal_pending(current)) {
  		set_thread_flag(TIF_MEMDIE);
  		return;
  	}
  
  	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
  	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
  	for_each_mem_cgroup_tree(iter, memcg) {
  		struct cgroup *cgroup = iter->css.cgroup;
  		struct cgroup_iter it;
  		struct task_struct *task;
  
  		cgroup_iter_start(cgroup, &it);
  		while ((task = cgroup_iter_next(cgroup, &it))) {
  			switch (oom_scan_process_thread(task, totalpages, NULL,
  							false)) {
  			case OOM_SCAN_SELECT:
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = ULONG_MAX;
  				get_task_struct(chosen);
  				/* fall through */
  			case OOM_SCAN_CONTINUE:
  				continue;
  			case OOM_SCAN_ABORT:
  				cgroup_iter_end(cgroup, &it);
  				mem_cgroup_iter_break(memcg, iter);
  				if (chosen)
  					put_task_struct(chosen);
  				return;
  			case OOM_SCAN_OK:
  				break;
  			};
  			points = oom_badness(task, memcg, NULL, totalpages);
  			if (points > chosen_points) {
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = points;
  				get_task_struct(chosen);
  			}
  		}
  		cgroup_iter_end(cgroup, &it);
  	}
  
  	if (!chosen)
  		return;
  	points = chosen_points * 1000 / totalpages;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1455
1456
  	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
  			 NULL, "Memory cgroup out of memory");
9cbb78bb3   David Rientjes   mm, memcg: introd...
1457
  }
5660048cc   Johannes Weiner   mm: move memcg hi...
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
  static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
  					gfp_t gfp_mask,
  					unsigned long flags)
  {
  	unsigned long total = 0;
  	bool noswap = false;
  	int loop;
  
  	if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
  		noswap = true;
  	if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
  		noswap = true;
  
  	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
  		if (loop)
  			drain_all_stock_async(memcg);
  		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
  		/*
  		 * Allow limit shrinkers, which are triggered directly
  		 * by userspace, to catch signals and stop reclaim
  		 * after minimal progress, regardless of the margin.
  		 */
  		if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
  			break;
  		if (mem_cgroup_margin(memcg))
  			break;
  		/*
  		 * If nothing was reclaimed after two attempts, there
  		 * may be no reclaimable pages in this hierarchy.
  		 */
  		if (loop && !total)
  			break;
  	}
  	return total;
  }
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1493
1494
  /**
   * test_mem_cgroup_node_reclaimable
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1495
   * @memcg: the target memcg
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1496
1497
1498
1499
1500
1501
1502
   * @nid: the node ID to be checked.
   * @noswap: specify true here if the user wants file only information.
   *
   * This function returns whether the specified memcg contains any
   * reclaimable pages on a node. Returns true if there are any reclaimable
   * pages in the node.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1503
  static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1504
1505
  		int nid, bool noswap)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1506
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1507
1508
1509
  		return true;
  	if (noswap || !total_swap_pages)
  		return false;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1510
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1511
1512
1513
1514
  		return true;
  	return false;
  
  }
889976dbc   Ying Han   memcg: reclaim me...
1515
1516
1517
1518
1519
1520
1521
1522
  #if MAX_NUMNODES > 1
  
  /*
   * Always updating the nodemask is not very good - even if we have an empty
   * list or the wrong list here, we can start from some node and traverse all
   * nodes based on the zonelist. So update the list loosely once per 10 secs.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1523
  static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1524
1525
  {
  	int nid;
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1526
1527
1528
1529
  	/*
  	 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
  	 * pagein/pageout changes since the last update.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1530
  	if (!atomic_read(&memcg->numainfo_events))
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1531
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1532
  	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
889976dbc   Ying Han   memcg: reclaim me...
1533
  		return;
889976dbc   Ying Han   memcg: reclaim me...
1534
  	/* make a nodemask where this memcg uses memory from */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1535
  	memcg->scan_nodes = node_states[N_HIGH_MEMORY];
889976dbc   Ying Han   memcg: reclaim me...
1536
1537
  
  	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1538
1539
  		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
  			node_clear(nid, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1540
  	}
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1541

c0ff4b854   Raghavendra K T   memcg: rename mem...
1542
1543
  	atomic_set(&memcg->numainfo_events, 0);
  	atomic_set(&memcg->numainfo_updating, 0);
889976dbc   Ying Han   memcg: reclaim me...
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
  }
  
  /*
   * Selecting a node where we start reclaim from. Because what we need is just
   * reducing the usage counter, starting from anywhere is OK. Considering
   * memory reclaim from the current node, there are pros and cons.
   *
   * Freeing memory from the current node means freeing memory from a node which
   * we'll use or have used. So, it may make the LRU bad. And if several threads
   * hit their limits, they will see contention on one node. But freeing from a
   * remote node means more cost for memory reclaim because of memory latency.
   *
   * Now, we use round-robin. A better algorithm is welcome.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1558
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1559
1560
  {
  	int node;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1561
1562
  	mem_cgroup_may_update_nodemask(memcg);
  	node = memcg->last_scanned_node;
889976dbc   Ying Han   memcg: reclaim me...
1563

c0ff4b854   Raghavendra K T   memcg: rename mem...
1564
  	node = next_node(node, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1565
  	if (node == MAX_NUMNODES)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1566
  		node = first_node(memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1567
1568
1569
1570
1571
1572
1573
1574
  	/*
  	 * We call this when we hit the limit, not when pages are added to the LRU.
  	 * No LRU may hold pages because all pages are UNEVICTABLE, or the
  	 * memcg is too small and all pages are not on the LRU. In that case,
  	 * we use the current node.
  	 */
  	if (unlikely(node == MAX_NUMNODES))
  		node = numa_node_id();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1575
  	memcg->last_scanned_node = node;
889976dbc   Ying Han   memcg: reclaim me...
1576
1577
  	return node;
  }
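
  /*
   * Illustrative example of the round-robin above: with scan_nodes = {0, 2},
   * successive calls return 0, 2, 0, 2, ...; when the mask turns out to be
   * empty, the current node is used instead.
   */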
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1578
1579
1580
1581
1582
1583
  /*
   * Check all nodes for whether they contain reclaimable pages or not.
   * For a quick scan, we make use of scan_nodes. This allows us to skip
   * unused nodes. But scan_nodes is lazily updated and may not contain
   * enough new information. We need to double check.
   */
6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
1584
  static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1585
1586
1587
1588
1589
1590
1591
  {
  	int nid;
  
  	/*
  	 * quick check... making use of scan_nodes.
  	 * We can skip unused nodes.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1592
1593
  	if (!nodes_empty(memcg->scan_nodes)) {
  		for (nid = first_node(memcg->scan_nodes);
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1594
  		     nid < MAX_NUMNODES;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1595
  		     nid = next_node(nid, memcg->scan_nodes)) {
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1596

c0ff4b854   Raghavendra K T   memcg: rename mem...
1597
  			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1598
1599
1600
1601
1602
1603
1604
  				return true;
  		}
  	}
  	/*
  	 * Check rest of nodes.
  	 */
  	for_each_node_state(nid, N_HIGH_MEMORY) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1605
  		if (node_isset(nid, memcg->scan_nodes))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1606
  			continue;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1607
  		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1608
1609
1610
1611
  			return true;
  	}
  	return false;
  }
889976dbc   Ying Han   memcg: reclaim me...
1612
  #else
c0ff4b854   Raghavendra K T   memcg: rename mem...
1613
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1614
1615
1616
  {
  	return 0;
  }
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1617

6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
1618
  static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1619
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1620
  	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1621
  }
889976dbc   Ying Han   memcg: reclaim me...
1622
  #endif
5660048cc   Johannes Weiner   mm: move memcg hi...
1623
1624
1625
1626
  static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
  				   struct zone *zone,
  				   gfp_t gfp_mask,
  				   unsigned long *total_scanned)
6d61ef409   Balbir Singh   memcg: memory cgr...
1627
  {
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1628
  	struct mem_cgroup *victim = NULL;
5660048cc   Johannes Weiner   mm: move memcg hi...
1629
  	int total = 0;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1630
  	int loop = 0;
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1631
  	unsigned long excess;
185efc0f9   Johannes Weiner   memcg: Revert "me...
1632
  	unsigned long nr_scanned;
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1633
1634
1635
1636
  	struct mem_cgroup_reclaim_cookie reclaim = {
  		.zone = zone,
  		.priority = 0,
  	};
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1637

c0ff4b854   Raghavendra K T   memcg: rename mem...
1638
  	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1639

4e4169535   Balbir Singh   memory controller...
1640
  	while (1) {
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1641
  		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1642
  		if (!victim) {
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1643
  			loop++;
4e4169535   Balbir Singh   memory controller...
1644
1645
1646
1647
1648
1649
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
  				 * anything, it might be because there are
  				 * no reclaimable pages under this hierarchy
  				 */
5660048cc   Johannes Weiner   mm: move memcg hi...
1650
  				if (!total)
4e4169535   Balbir Singh   memory controller...
1651
  					break;
4e4169535   Balbir Singh   memory controller...
1652
  				/*
25985edce   Lucas De Marchi   Fix common misspe...
1653
  				 * We want to do more targeted reclaim.
4e4169535   Balbir Singh   memory controller...
1654
1655
1656
1657
1658
  				 * excess >> 2 is not too excessive, so we
  				 * neither reclaim too much nor too little,
  				 * which would keep us coming back to reclaim
  				 * from this cgroup
  				 */
  				if (total >= (excess >> 2) ||
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1659
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
4e4169535   Balbir Singh   memory controller...
1660
  					break;
4e4169535   Balbir Singh   memory controller...
1661
  			}
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1662
  			continue;
4e4169535   Balbir Singh   memory controller...
1663
  		}
5660048cc   Johannes Weiner   mm: move memcg hi...
1664
  		if (!mem_cgroup_reclaimable(victim, false))
6d61ef409   Balbir Singh   memcg: memory cgr...
1665
  			continue;
5660048cc   Johannes Weiner   mm: move memcg hi...
1666
1667
1668
1669
  		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
  						     zone, &nr_scanned);
  		*total_scanned += nr_scanned;
  		if (!res_counter_soft_limit_excess(&root_memcg->res))
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1670
  			break;
6d61ef409   Balbir Singh   memcg: memory cgr...
1671
  	}
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1672
  	mem_cgroup_iter_break(root_memcg, victim);
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
1673
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1674
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1675
1676
1677
  /*
   * Check whether the OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
1af8efe96   Michal Hocko   memcg: change mem...
1678
   * Has to be called with memcg_oom_lock
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1679
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1680
  static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1681
  {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1682
  	struct mem_cgroup *iter, *failed = NULL;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1683

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1684
  	for_each_mem_cgroup_tree(iter, memcg) {
23751be00   Johannes Weiner   memcg: fix hierar...
1685
  		if (iter->oom_lock) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1686
1687
1688
1689
  			/*
  			 * this subtree of our hierarchy is already locked
  			 * so we cannot give a lock.
  			 */
79dfdaccd   Michal Hocko   memcg: make oom_l...
1690
  			failed = iter;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1691
1692
  			mem_cgroup_iter_break(memcg, iter);
  			break;
23751be00   Johannes Weiner   memcg: fix hierar...
1693
1694
  		} else
  			iter->oom_lock = true;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1695
  	}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1696

79dfdaccd   Michal Hocko   memcg: make oom_l...
1697
  	if (!failed)
23751be00   Johannes Weiner   memcg: fix hierar...
1698
  		return true;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1699
1700
1701
1702
1703
  
  	/*
  	 * OK, we failed to lock the whole subtree, so we have to clean up
  	 * what we set up, up to the failing subtree.
  	 */
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1704
  	for_each_mem_cgroup_tree(iter, memcg) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1705
  		if (iter == failed) {
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1706
1707
  			mem_cgroup_iter_break(memcg, iter);
  			break;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1708
1709
1710
  		}
  		iter->oom_lock = false;
  	}
23751be00   Johannes Weiner   memcg: fix hierar...
1711
  	return false;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1712
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1713

79dfdaccd   Michal Hocko   memcg: make oom_l...
1714
  /*
1af8efe96   Michal Hocko   memcg: change mem...
1715
   * Has to be called with memcg_oom_lock
79dfdaccd   Michal Hocko   memcg: make oom_l...
1716
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1717
  static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1718
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1719
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1720
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1721
1722
1723
  		iter->oom_lock = false;
  	return 0;
  }
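
  /*
   * Illustrative sketch of the protocol around the two helpers above; it
   * mirrors what mem_cgroup_handle_oom() below does:
   *
   *	spin_lock(&memcg_oom_lock);
   *	locked = mem_cgroup_oom_lock(memcg);	// try to own the whole subtree
   *	...
   *	spin_unlock(&memcg_oom_lock);
   *	...
   *	spin_lock(&memcg_oom_lock);
   *	if (locked)
   *		mem_cgroup_oom_unlock(memcg);
   *	spin_unlock(&memcg_oom_lock);
   */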
c0ff4b854   Raghavendra K T   memcg: rename mem...
1724
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1725
1726
  {
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1727
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1728
1729
  		atomic_inc(&iter->under_oom);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1730
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1731
1732
  {
  	struct mem_cgroup *iter;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1733
1734
1735
1736
1737
  	/*
  	 * When a new child is created while the hierarchy is under oom,
  	 * mem_cgroup_oom_lock() may not be called. We have to use
  	 * atomic_add_unless() here.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1738
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1739
  		atomic_add_unless(&iter->under_oom, -1, 0);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1740
  }
1af8efe96   Michal Hocko   memcg: change mem...
1741
  static DEFINE_SPINLOCK(memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1742
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1743
  struct oom_wait_info {
d79154bb5   Hugh Dickins   memcg: replace me...
1744
  	struct mem_cgroup *memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1745
1746
1747
1748
1749
1750
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
1751
1752
  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
  	struct mem_cgroup *oom_wait_memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1753
1754
1755
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
d79154bb5   Hugh Dickins   memcg: replace me...
1756
  	oom_wait_memcg = oom_wait_info->memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1757

dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1758
  	/*
d79154bb5   Hugh Dickins   memcg: replace me...
1759
  	 * Both oom_wait_info->memcg and wake_memcg are stable under us.
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1760
1761
  	 * Then we can use css_is_ancestor without taking care of RCU.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1762
1763
  	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
  		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1764
  		return 0;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1765
1766
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1767
  static void memcg_wakeup_oom(struct mem_cgroup *memcg)
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1768
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1769
1770
  	/* for filtering, pass "memcg" as argument. */
  	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1771
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1772
  static void memcg_oom_recover(struct mem_cgroup *memcg)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1773
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1774
1775
  	if (memcg && atomic_read(&memcg->under_oom))
  		memcg_wakeup_oom(memcg);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1776
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1777
1778
1779
  /*
   * try to call OOM killer. returns false if we should exit memory-reclaim loop.
   */
6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
1780
1781
  static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
  				  int order)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1782
  {
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1783
  	struct oom_wait_info owait;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1784
  	bool locked, need_to_kill;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1785

d79154bb5   Hugh Dickins   memcg: replace me...
1786
  	owait.memcg = memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1787
1788
1789
1790
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1791
  	need_to_kill = true;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1792
  	mem_cgroup_mark_under_oom(memcg);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1793

c0ff4b854   Raghavendra K T   memcg: rename mem...
1794
  	/* At first, try to OOM-lock the hierarchy under memcg. */
1af8efe96   Michal Hocko   memcg: change mem...
1795
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1796
  	locked = mem_cgroup_oom_lock(memcg);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1797
1798
1799
1800
1801
  	/*
  	 * Even if signal_pending(), we can't quit charge() loop without
  	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
  	 * under OOM is always welcome, so use TASK_KILLABLE here.
  	 */
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1802
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1803
  	if (!locked || memcg->oom_kill_disable)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1804
1805
  		need_to_kill = false;
  	if (locked)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1806
  		mem_cgroup_oom_notify(memcg);
1af8efe96   Michal Hocko   memcg: change mem...
1807
  	spin_unlock(&memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1808

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1809
1810
  	if (need_to_kill) {
  		finish_wait(&memcg_oom_waitq, &owait.wait);
e845e1993   David Rientjes   mm, memcg: pass c...
1811
  		mem_cgroup_out_of_memory(memcg, mask, order);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1812
  	} else {
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1813
  		schedule();
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1814
  		finish_wait(&memcg_oom_waitq, &owait.wait);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1815
  	}
1af8efe96   Michal Hocko   memcg: change mem...
1816
  	spin_lock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1817
  	if (locked)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1818
1819
  		mem_cgroup_oom_unlock(memcg);
  	memcg_wakeup_oom(memcg);
1af8efe96   Michal Hocko   memcg: change mem...
1820
  	spin_unlock(&memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1821

c0ff4b854   Raghavendra K T   memcg: rename mem...
1822
  	mem_cgroup_unmark_under_oom(memcg);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1823

867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1824
1825
1826
  	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
  		return false;
  	/* Give a chance to the dying process */
715a5ee82   KAMEZAWA Hiroyuki   memcg: fix oom sc...
1827
  	schedule_timeout_uninterruptible(1);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1828
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1829
  }
d69b042f3   Balbir Singh   memcg: add file-b...
1830
1831
1832
  /*
   * Currently used to update mapped file statistics, but the routine can be
   * generalized to update other statistics as well.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
   *
   * Notes: Race condition
   *
   * We usually use page_cgroup_lock() for accessing page_cgroup members, but
   * it tends to be costly. But considering some conditions, we don't need
   * to do so _always_.
   *
   * Considering "charge", lock_page_cgroup() is not required because all
   * file-stat operations happen after a page is attached to the radix-tree. There
   * is no race with "charge".
   *
   * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
   * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
   * if there is a race with "uncharge". The statistics themselves are properly
   * handled by flags.
   *
   * Considering "move", this is the only case where we see a race. To make the race
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1850
1851
   * window small, we check mm->moving_account and detect whether there is any
   * possibility of a race. If there is, we take a lock.
d69b042f3   Balbir Singh   memcg: add file-b...
1852
   */
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1853

89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
  void __mem_cgroup_begin_update_page_stat(struct page *page,
  				bool *locked, unsigned long *flags)
  {
  	struct mem_cgroup *memcg;
  	struct page_cgroup *pc;
  
  	pc = lookup_page_cgroup(page);
  again:
  	memcg = pc->mem_cgroup;
  	if (unlikely(!memcg || !PageCgroupUsed(pc)))
  		return;
  	/*
  	 * If this memory cgroup is not under account moving, we don't
da92c47d0   Wanpeng Li   mm/memcg: replace...
1867
  	 * need to take move_lock_mem_cgroup(). Because we already hold
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1868
  	 * rcu_read_lock(), any calls to move_account will be delayed until
13fd1dd9d   Andrew Morton   mm/memcontrol.c: ...
1869
  	 * rcu_read_unlock() if mem_cgroup_stolen() == true.
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1870
  	 */
13fd1dd9d   Andrew Morton   mm/memcontrol.c: ...
1871
  	if (!mem_cgroup_stolen(memcg))
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
  		return;
  
  	move_lock_mem_cgroup(memcg, flags);
  	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
  		move_unlock_mem_cgroup(memcg, flags);
  		goto again;
  	}
  	*locked = true;
  }
  
  void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
  {
  	struct page_cgroup *pc = lookup_page_cgroup(page);
  
  	/*
  	 * It's guaranteed that pc->mem_cgroup never changes while
  	 * the lock is held, because any routine that modifies pc->mem_cgroup
da92c47d0   Wanpeng Li   mm/memcg: replace...
1889
  	 * should take move_lock_mem_cgroup().
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1890
1891
1892
  	 */
  	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
  }
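
  /*
   * Illustrative caller-side sketch (hypothetical; the real callers live in
   * rmap code outside this file), using only helpers defined in this file:
   *
   *	bool locked = false;
   *	unsigned long flags;
   *
   *	rcu_read_lock();
   *	__mem_cgroup_begin_update_page_stat(page, &locked, &flags);
   *	mem_cgroup_update_page_stat(page, MEMCG_NR_FILE_MAPPED, 1);
   *	if (locked)
   *		__mem_cgroup_end_update_page_stat(page, &flags);
   *	rcu_read_unlock();
   */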
2a7106f2c   Greg Thelen   memcg: create ext...
1893
1894
  void mem_cgroup_update_page_stat(struct page *page,
  				 enum mem_cgroup_page_stat_item idx, int val)
d69b042f3   Balbir Singh   memcg: add file-b...
1895
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1896
  	struct mem_cgroup *memcg;
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1897
  	struct page_cgroup *pc = lookup_page_cgroup(page);
dbd4ea78f   KAMEZAWA Hiroyuki   memcg: add lock t...
1898
  	unsigned long uninitialized_var(flags);
d69b042f3   Balbir Singh   memcg: add file-b...
1899

cfa449461   Johannes Weiner   mm: memcg: lookup...
1900
  	if (mem_cgroup_disabled())
d69b042f3   Balbir Singh   memcg: add file-b...
1901
  		return;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1902

c0ff4b854   Raghavendra K T   memcg: rename mem...
1903
1904
  	memcg = pc->mem_cgroup;
  	if (unlikely(!memcg || !PageCgroupUsed(pc)))
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1905
  		return;
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1906

26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1907
  	switch (idx) {
2a7106f2c   Greg Thelen   memcg: create ext...
1908
  	case MEMCG_NR_FILE_MAPPED:
2a7106f2c   Greg Thelen   memcg: create ext...
1909
  		idx = MEM_CGROUP_STAT_FILE_MAPPED;
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1910
1911
1912
  		break;
  	default:
  		BUG();
8725d5416   KAMEZAWA Hiroyuki   memcg: fix race i...
1913
  	}
d69b042f3   Balbir Singh   memcg: add file-b...
1914

c0ff4b854   Raghavendra K T   memcg: rename mem...
1915
  	this_cpu_add(memcg->stat->count[idx], val);
d69b042f3   Balbir Singh   memcg: add file-b...
1916
  }
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1917

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1918
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1919
1920
1921
   * size of the first charge trial. "32" comes from vmscan.c's magic value.
   * TODO: it may be necessary to use bigger numbers on big iron.
   */
7ec99d621   Johannes Weiner   memcg: unify char...
1922
  #define CHARGE_BATCH	32U
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1923
1924
  struct memcg_stock_pcp {
  	struct mem_cgroup *cached; /* this is never the root cgroup */
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1925
  	unsigned int nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1926
  	struct work_struct work;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1927
  	unsigned long flags;
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
1928
  #define FLUSHING_CACHED_CHARGE	0
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1929
1930
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
9f50fad65   Michal Hocko   Revert "memcg: ge...
1931
  static DEFINE_MUTEX(percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1932
1933
  
  /*
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1934
   * Try to consume stocked charge on this cpu. If successful, one page is consumed
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1935
1936
1937
1938
   * from the local stock and true is returned. If the stock is 0 or holds charges
   * from a cgroup which is not the current target, false is returned. This stock
   * will then be refilled.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1939
  static bool consume_stock(struct mem_cgroup *memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1940
1941
1942
1943
1944
  {
  	struct memcg_stock_pcp *stock;
  	bool ret = true;
  
  	stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1945
  	if (memcg == stock->cached && stock->nr_pages)
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1946
  		stock->nr_pages--;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
  	else /* need to call res_counter_charge */
  		ret = false;
  	put_cpu_var(memcg_stock);
  	return ret;
  }
  
  /*
   * Returns stocks cached in percpu to the res_counter and resets cached information.
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1959
1960
1961
1962
  	if (stock->nr_pages) {
  		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
  
  		res_counter_uncharge(&old->res, bytes);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1963
  		if (do_swap_account)
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1964
1965
  			res_counter_uncharge(&old->memsw, bytes);
  		stock->nr_pages = 0;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1966
1967
  	}
  	stock->cached = NULL;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
  }
  
  /*
   * This must be called with preemption disabled or must be called by
   * a thread which is pinned to the local cpu.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
  	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
  	drain_stock(stock);
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1978
  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1979
1980
1981
1982
  }
  
  /*
   * Cache charges(val), which come from the res_counter, in the local per_cpu area.
320cc51d9   Greg Thelen   mm: fix typo in r...
1983
   * They will be consumed by the consume_stock() function later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1984
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1985
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1986
1987
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1988
  	if (stock->cached != memcg) { /* reset if necessary */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1989
  		drain_stock(stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1990
  		stock->cached = memcg;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1991
  	}
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1992
  	stock->nr_pages += nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1993
1994
1995
1996
  	put_cpu_var(memcg_stock);
  }
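
  /*
   * Illustrative charge-path sketch (hypothetical and heavily simplified; the
   * real charge path also reclaims and may invoke the OOM killer; "nr_pages"
   * is the hypothetical caller's request size):
   *
   *	if (consume_stock(memcg))
   *		return 0;		// served from the per-cpu stock
   *	// otherwise charge a whole CHARGE_BATCH against the res_counter and
   *	// park the surplus in the local stock for later requests:
   *	refill_stock(memcg, CHARGE_BATCH - nr_pages);
   */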
  
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1997
   * Drains all per-CPU charge caches for the given root_memcg and the subtree
d38144b7a   Michal Hocko   memcg: unify sync...
1998
1999
   * of the hierarchy under it. The sync flag says whether we should block
   * until the work is done.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2000
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2001
  static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2002
  {
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2003
  	int cpu, curcpu;
d38144b7a   Michal Hocko   memcg: unify sync...
2004

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2005
  	/* Notify other cpus that system-wide "drain" is running */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2006
  	get_online_cpus();
5af12d0ef   Johannes Weiner   memcg: pin execut...
2007
  	curcpu = get_cpu();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2008
2009
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2010
  		struct mem_cgroup *memcg;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2011

c0ff4b854   Raghavendra K T   memcg: rename mem...
2012
2013
  		memcg = stock->cached;
  		if (!memcg || !stock->nr_pages)
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2014
  			continue;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2015
  		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
3e92041d6   Michal Hocko   memcg: add mem_cg...
2016
  			continue;
d1a05b697   Michal Hocko   memcg: do not try...
2017
2018
2019
2020
2021
2022
  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
  			if (cpu == curcpu)
  				drain_local_stock(&stock->work);
  			else
  				schedule_work_on(cpu, &stock->work);
  		}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2023
  	}
5af12d0ef   Johannes Weiner   memcg: pin execut...
2024
  	put_cpu();
d38144b7a   Michal Hocko   memcg: unify sync...
2025
2026
2027
2028
2029
2030
  
  	if (!sync)
  		goto out;
  
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2031
  		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
d38144b7a   Michal Hocko   memcg: unify sync...
2032
2033
2034
  			flush_work(&stock->work);
  	}
  out:
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2035
   	put_online_cpus();
d38144b7a   Michal Hocko   memcg: unify sync...
2036
2037
2038
2039
2040
2041
2042
2043
  }
  
  /*
   * Tries to drain stocked charges on other cpus. This function is asynchronous
   * and just puts a work item per cpu for draining locally on each cpu. The
   * caller can expect some charges to be returned to the res_counter later,
   * but cannot wait for that to happen.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2044
  static void drain_all_stock_async(struct mem_cgroup *root_memcg)
d38144b7a   Michal Hocko   memcg: unify sync...
2045
  {
9f50fad65   Michal Hocko   Revert "memcg: ge...
2046
2047
2048
2049
2050
  	/*
  	 * If someone calls draining, avoid adding more kworker runs.
  	 */
  	if (!mutex_trylock(&percpu_charge_mutex))
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2051
  	drain_all_stock(root_memcg, false);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2052
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2053
2054
2055
  }
  
  /* This is a synchronous drain interface. */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2056
  static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2057
2058
  {
  	/* called when force_empty is called */
9f50fad65   Michal Hocko   Revert "memcg: ge...
2059
  	mutex_lock(&percpu_charge_mutex);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2060
  	drain_all_stock(root_memcg, true);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2061
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2062
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2063
2064
2065
2066
  /*
   * This function drains the percpu counter values from a dead cpu and
   * moves them to the local cpu. Note that this function can be preempted.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2067
  static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2068
2069
  {
  	int i;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2070
  	spin_lock(&memcg->pcp_counter_lock);
6104621de   Johannes Weiner   mm: memcg: remove...
2071
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2072
  		long x = per_cpu(memcg->stat->count[i], cpu);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2073

c0ff4b854   Raghavendra K T   memcg: rename mem...
2074
2075
  		per_cpu(memcg->stat->count[i], cpu) = 0;
  		memcg->nocpu_base.count[i] += x;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2076
  	}
e9f8974f2   Johannes Weiner   memcg: break out ...
2077
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2078
  		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
2079

c0ff4b854   Raghavendra K T   memcg: rename mem...
2080
2081
  		per_cpu(memcg->stat->events[i], cpu) = 0;
  		memcg->nocpu_base.events[i] += x;
e9f8974f2   Johannes Weiner   memcg: break out ...
2082
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2083
  	spin_unlock(&memcg->pcp_counter_lock);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2084
2085
2086
  }
  
  static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2087
2088
2089
2090
2091
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2092
  	struct mem_cgroup *iter;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2093

619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
2094
  	if (action == CPU_ONLINE)
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2095
  		return NOTIFY_OK;
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2096

d833049bd   Kirill A. Shutemov   memcg: fix broken...
2097
  	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2098
  		return NOTIFY_OK;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2099

9f3a0d093   Johannes Weiner   mm: memcg: consol...
2100
  	for_each_mem_cgroup(iter)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2101
  		mem_cgroup_drain_pcp_counter(iter, cpu);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2102
2103
2104
2105
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2106
2107
2108
2109
2110
2111
2112
2113
2114
  
  /* See __mem_cgroup_try_charge() for details */
  enum {
  	CHARGE_OK,		/* success */
  	CHARGE_RETRY,		/* need to retry but retry is not bad */
  	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
  	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and not enough resources */
  	CHARGE_OOM_DIE,		/* the current task is killed because of OOM */
  };
c0ff4b854   Raghavendra K T   memcg: rename mem...
2115
  static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
7ec99d621   Johannes Weiner   memcg: unify char...
2116
  				unsigned int nr_pages, bool oom_check)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2117
  {
7ec99d621   Johannes Weiner   memcg: unify char...
2118
  	unsigned long csize = nr_pages * PAGE_SIZE;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2119
2120
2121
2122
  	struct mem_cgroup *mem_over_limit;
  	struct res_counter *fail_res;
  	unsigned long flags = 0;
  	int ret;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2123
  	ret = res_counter_charge(&memcg->res, csize, &fail_res);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2124
2125
2126
2127
  
  	if (likely(!ret)) {
  		if (!do_swap_account)
  			return CHARGE_OK;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2128
  		ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2129
2130
  		if (likely(!ret))
  			return CHARGE_OK;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2131
  		res_counter_uncharge(&memcg->res, csize);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2132
2133
2134
2135
  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
  		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
  	} else
  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
9221edb71   Johannes Weiner   memcg: prevent en...
2136
  	/*
7ec99d621   Johannes Weiner   memcg: unify char...
2137
2138
  	 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
  	 * of regular pages (CHARGE_BATCH), or a single regular page (1).
9221edb71   Johannes Weiner   memcg: prevent en...
2139
2140
2141
2142
  	 *
  	 * Never reclaim on behalf of optional batching, retry with a
  	 * single page instead.
  	 */
7ec99d621   Johannes Weiner   memcg: unify char...
2143
  	if (nr_pages == CHARGE_BATCH)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2144
2145
2146
2147
  		return CHARGE_RETRY;
  
  	if (!(gfp_mask & __GFP_WAIT))
  		return CHARGE_WOULDBLOCK;
5660048cc   Johannes Weiner   mm: move memcg hi...
2148
  	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
7ec99d621   Johannes Weiner   memcg: unify char...
2149
  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
19942822d   Johannes Weiner   memcg: prevent en...
2150
  		return CHARGE_RETRY;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2151
  	/*
19942822d   Johannes Weiner   memcg: prevent en...
2152
2153
2154
2155
2156
2157
2158
  	 * Even though the limit is exceeded at this point, reclaim
  	 * may have been able to free some pages.  Retry the charge
  	 * before killing the task.
  	 *
  	 * Only for regular pages, though: huge pages are rather
  	 * unlikely to succeed so close to the limit, and we fall back
  	 * to regular pages anyway in case of failure.
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2159
  	 */
7ec99d621   Johannes Weiner   memcg: unify char...
2160
  	if (nr_pages == 1 && ret)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
  		return CHARGE_RETRY;
  
  	/*
  	 * During task move, charges can be double-counted. So it's
  	 * better to wait until the end of task_move if one is in progress.
  	 */
  	if (mem_cgroup_wait_acct_move(mem_over_limit))
  		return CHARGE_RETRY;
  
  	/* If we don't need to call the oom-killer at all, return immediately */
  	if (!oom_check)
  		return CHARGE_NOMEM;
  	/* check OOM */
e845e1993   David Rientjes   mm, memcg: pass c...
2174
  	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2175
2176
2177
2178
  		return CHARGE_OOM_DIE;
  
  	return CHARGE_RETRY;
  }
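/*
 * [Editor's note -- illustrative sketch, not part of memcontrol.c]
 * A compact restatement of the retry policy implemented by
 * mem_cgroup_do_charge() above.  It elides the reclaim-progress check and
 * the wait for a concurrent task move, and every name below (sketch_*) is
 * invented; only the ordering of the decisions mirrors the function above.
 */
#include <stdbool.h>

enum sketch_result { S_OK, S_RETRY, S_NOMEM, S_WOULDBLOCK, S_OOM_DIE };

struct sketch_charge_ctx {
	bool over_limit;	/* res_counter_charge() failed */
	bool batching;		/* nr_pages == CHARGE_BATCH */
	bool can_wait;		/* __GFP_WAIT was set */
	bool margin_ok;		/* margin >= nr_pages after reclaim */
	bool single_page;	/* nr_pages == 1 */
	bool oom_check;		/* retries exhausted, OOM killer allowed */
	bool current_killed;	/* OOM killer chose the current task */
};

static enum sketch_result sketch_do_charge(const struct sketch_charge_ctx *c)
{
	if (!c->over_limit)
		return S_OK;
	if (c->batching)	/* never reclaim for optional batching */
		return S_RETRY;
	if (!c->can_wait)
		return S_WOULDBLOCK;
	if (c->margin_ok || c->single_page)
		return S_RETRY;	/* reclaim may have freed enough */
	if (!c->oom_check)
		return S_NOMEM;	/* let the caller decide about OOM */
	return c->current_killed ? S_OOM_DIE : S_RETRY;
}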
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2179
  /*
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
   * __mem_cgroup_try_charge() does
   * 1. detect the memcg to be charged against from the passed *mm and *ptr,
   * 2. update the res_counter,
   * 3. call memory reclaim if necessary.
   *
   * As a special case, if the task is dying (fatal_signal_pending() or
   * TIF_MEMDIE is set), this function returns -EINTR while writing
   * root_mem_cgroup to *ptr. There are two reasons for this. 1: dying threads
   * should quit as soon as possible without any hazards. 2: all pages should
   * have a valid pc->mem_cgroup. If mm is NULL and the caller doesn't pass a
   * valid memcg pointer, that is treated as a charge to root_mem_cgroup.
   *
   * So __mem_cgroup_try_charge() will return
   *  0       ...  on success, filling *ptr with a valid memcg pointer.
   *  -ENOMEM ...  charge failure because of resource limits.
   *  -EINTR  ...  if the thread is dying. *ptr is filled with root_mem_cgroup.
   *
   * Unlike the exported interface, an "oom" parameter is added. If oom==true,
   * the oom-killer can be invoked.
8a9f3ccd2   Balbir Singh   Memory controller...
2199
   */
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2200
  static int __mem_cgroup_try_charge(struct mm_struct *mm,
ec1685109   Andrea Arcangeli   thp: memcg compound
2201
  				   gfp_t gfp_mask,
7ec99d621   Johannes Weiner   memcg: unify char...
2202
  				   unsigned int nr_pages,
c0ff4b854   Raghavendra K T   memcg: rename mem...
2203
  				   struct mem_cgroup **ptr,
7ec99d621   Johannes Weiner   memcg: unify char...
2204
  				   bool oom)
8a9f3ccd2   Balbir Singh   Memory controller...
2205
  {
7ec99d621   Johannes Weiner   memcg: unify char...
2206
  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2207
  	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2208
  	struct mem_cgroup *memcg = NULL;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2209
  	int ret;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
2210

867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2211
2212
2213
2214
2215
2216
2217
2218
  	/*
  	 * Unlike the global VM's OOM kill, we're not under a system-level
  	 * memory shortage here. So, allow a dying process to proceed, in
  	 * addition to a MEMDIE process.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE)
  		     || fatal_signal_pending(current)))
  		goto bypass;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
2219

8a9f3ccd2   Balbir Singh   Memory controller...
2220
  	/*
3be91277e   Hugh Dickins   memcgroup: tidy u...
2221
2222
  	 * We always charge the cgroup the mm_struct belongs to.
  	 * The mm_struct's mem_cgroup changes on task migration if the
8a9f3ccd2   Balbir Singh   Memory controller...
2223
  	 * thread group leader migrates. It's possible that mm is not
24467cacc   Johannes Weiner   mm: memcg: remove...
2224
  	 * set, if so charge the root memcg (happens for pagecache usage).
8a9f3ccd2   Balbir Singh   Memory controller...
2225
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2226
  	if (!*ptr && !mm)
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2227
  		*ptr = root_mem_cgroup;
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2228
  again:
c0ff4b854   Raghavendra K T   memcg: rename mem...
2229
2230
2231
2232
  	if (*ptr) { /* css should be a valid one */
  		memcg = *ptr;
  		VM_BUG_ON(css_is_removed(&memcg->css));
  		if (mem_cgroup_is_root(memcg))
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2233
  			goto done;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2234
  		if (nr_pages == 1 && consume_stock(memcg))
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2235
  			goto done;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2236
  		css_get(&memcg->css);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2237
  	} else {
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2238
  		struct task_struct *p;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2239

f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2240
2241
  		rcu_read_lock();
  		p = rcu_dereference(mm->owner);
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2242
  		/*
ebb76ce16   KAMEZAWA Hiroyuki   memcg: fix wrong ...
2243
  		 * Because we don't have task_lock(), "p" can exit.
c0ff4b854   Raghavendra K T   memcg: rename mem...
2244
  		 * In that case, "memcg" can point to root or p can be NULL due to
ebb76ce16   KAMEZAWA Hiroyuki   memcg: fix wrong ...
2245
2246
2247
2248
2249
2250
  		 * a race with swapoff. Then, we have a small risk of mis-accounting.
  		 * But such mis-accounting due to races always happens anyway because
  		 * we don't have cgroup_mutex(). Avoiding it would be overkill, so we
  		 * allow that small race here.
  		 * (*) swapoff et al. will charge against the mm_struct, not against
  		 * the task_struct. So, mm->owner can be NULL.
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2251
  		 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2252
  		memcg = mem_cgroup_from_task(p);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2253
2254
2255
  		if (!memcg)
  			memcg = root_mem_cgroup;
  		if (mem_cgroup_is_root(memcg)) {
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2256
2257
2258
  			rcu_read_unlock();
  			goto done;
  		}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2259
  		if (nr_pages == 1 && consume_stock(memcg)) {
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
  			/*
  			 * It seems dangerous to access memcg without css_get().
  			 * But considering how consume_stock works, it's not
  			 * necessary. If consume_stock succeeds, some charges
  			 * from this memcg are cached on this cpu. So, we
  			 * don't need to call css_get()/css_tryget() before
  			 * calling consume_stock().
  			 */
  			rcu_read_unlock();
  			goto done;
  		}
  		/* after here, we may be blocked. we need to get refcnt */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2272
  		if (!css_tryget(&memcg->css)) {
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2273
2274
2275
2276
2277
  			rcu_read_unlock();
  			goto again;
  		}
  		rcu_read_unlock();
  	}
8a9f3ccd2   Balbir Singh   Memory controller...
2278

4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2279
2280
  	do {
  		bool oom_check;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2281

4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2282
  		/* If killed, bypass charge */
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2283
  		if (fatal_signal_pending(current)) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2284
  			css_put(&memcg->css);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2285
  			goto bypass;
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2286
  		}
6d61ef409   Balbir Singh   memcg: memory cgr...
2287

4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2288
2289
2290
2291
  		oom_check = false;
  		if (oom && !nr_oom_retries) {
  			oom_check = true;
  			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2292
  		}
66e1707bc   Balbir Singh   Memory controller...
2293

c0ff4b854   Raghavendra K T   memcg: rename mem...
2294
  		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2295
2296
2297
2298
  		switch (ret) {
  		case CHARGE_OK:
  			break;
  		case CHARGE_RETRY: /* not in OOM situation but retry */
7ec99d621   Johannes Weiner   memcg: unify char...
2299
  			batch = nr_pages;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2300
2301
  			css_put(&memcg->css);
  			memcg = NULL;
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2302
  			goto again;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2303
  		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2304
  			css_put(&memcg->css);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2305
2306
  			goto nomem;
  		case CHARGE_NOMEM: /* OOM routine works */
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2307
  			if (!oom) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2308
  				css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2309
  				goto nomem;
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2310
  			}
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2311
2312
2313
2314
  			/* If oom, we never return -ENOMEM */
  			nr_oom_retries--;
  			break;
  		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2315
  			css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2316
  			goto bypass;
66e1707bc   Balbir Singh   Memory controller...
2317
  		}
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2318
  	} while (ret != CHARGE_OK);
7ec99d621   Johannes Weiner   memcg: unify char...
2319
  	if (batch > nr_pages)
c0ff4b854   Raghavendra K T   memcg: rename mem...
2320
2321
  		refill_stock(memcg, batch - nr_pages);
  	css_put(&memcg->css);
0c3e73e84   Balbir Singh   memcg: improve re...
2322
  done:
c0ff4b854   Raghavendra K T   memcg: rename mem...
2323
  	*ptr = memcg;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2324
2325
  	return 0;
  nomem:
c0ff4b854   Raghavendra K T   memcg: rename mem...
2326
  	*ptr = NULL;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2327
  	return -ENOMEM;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2328
  bypass:
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2329
2330
  	*ptr = root_mem_cgroup;
  	return -EINTR;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2331
  }
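/*
 * [Editor's note -- illustrative sketch, not part of memcontrol.c]
 * How a caller is expected to consume the three documented return values of
 * __mem_cgroup_try_charge().  sketch_try_charge() is a stand-in with the
 * same contract; it and sketch_charge_one_page() are invented names.
 */
#include <errno.h>

/* Stand-in honouring the 0 / -ENOMEM / -EINTR contract described above. */
static int sketch_try_charge(void *mm, unsigned int nr_pages,
			     void **memcgp, int oom)
{
	(void)mm; (void)nr_pages; (void)oom;
	*memcgp = (void *)1;	/* pretend *ptr now holds a valid memcg */
	return 0;
}

static int sketch_charge_one_page(void *mm, void **memcgp)
{
	int ret = sketch_try_charge(mm, 1, memcgp, 1);

	if (ret == -EINTR)	/* dying task: charge fell back to root */
		return 0;	/* proceed, *memcgp is root_mem_cgroup */
	if (ret == -ENOMEM)	/* over limit, reclaim and OOM didn't help */
		return -ENOMEM;
	return 0;		/* success, *memcgp holds a valid memcg */
}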
8a9f3ccd2   Balbir Singh   Memory controller...
2332

a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2333
  /*
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2334
2335
2336
2337
   * Sometimes we have to undo a charge we got by try_charge().
   * This function is for that: it uncharges and puts the css refcnt
   * gotten by try_charge().
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2338
  static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
e7018b8d2   Johannes Weiner   memcg: keep only ...
2339
  				       unsigned int nr_pages)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2340
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2341
  	if (!mem_cgroup_is_root(memcg)) {
e7018b8d2   Johannes Weiner   memcg: keep only ...
2342
  		unsigned long bytes = nr_pages * PAGE_SIZE;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2343
  		res_counter_uncharge(&memcg->res, bytes);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2344
  		if (do_swap_account)
c0ff4b854   Raghavendra K T   memcg: rename mem...
2345
  			res_counter_uncharge(&memcg->memsw, bytes);
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2346
  	}
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
2347
  }
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2348
  /*
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
   * Cancel charges in this cgroup....doesn't propagate to the parent cgroup.
   * This is useful when moving usage to the parent cgroup.
   */
  static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
  					unsigned int nr_pages)
  {
  	unsigned long bytes = nr_pages * PAGE_SIZE;
  
  	if (mem_cgroup_is_root(memcg))
  		return;
  
  	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
  	if (do_swap_account)
  		res_counter_uncharge_until(&memcg->memsw,
  						memcg->memsw.parent, bytes);
  }
  
  /*
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
   * A helper function to get a mem_cgroup from an ID. Must be called under
   * rcu_read_lock(). The caller must check css_is_removed() or similar if
   * that is a concern. (Dropping a refcnt taken for swap can be called
   * against a removed memcg.)
   */
  static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
  {
  	struct cgroup_subsys_state *css;
  
  	/* ID 0 is unused ID */
  	if (!id)
  		return NULL;
  	css = css_lookup(&mem_cgroup_subsys, id);
  	if (!css)
  		return NULL;
b21451459   Wanpeng Li   memcg: add mem_cg...
2382
  	return mem_cgroup_from_css(css);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2383
  }
e42d9d5d4   Wu Fengguang   memcg: rename and...
2384
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2385
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2386
  	struct mem_cgroup *memcg = NULL;
3c776e646   Daisuke Nishimura   memcg: charge swa...
2387
  	struct page_cgroup *pc;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2388
  	unsigned short id;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2389
  	swp_entry_t ent;
3c776e646   Daisuke Nishimura   memcg: charge swa...
2390
  	VM_BUG_ON(!PageLocked(page));
3c776e646   Daisuke Nishimura   memcg: charge swa...
2391
  	pc = lookup_page_cgroup(page);
c0bd3f63c   Daisuke Nishimura   memcg: fix try_ge...
2392
  	lock_page_cgroup(pc);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2393
  	if (PageCgroupUsed(pc)) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2394
2395
2396
  		memcg = pc->mem_cgroup;
  		if (memcg && !css_tryget(&memcg->css))
  			memcg = NULL;
e42d9d5d4   Wu Fengguang   memcg: rename and...
2397
  	} else if (PageSwapCache(page)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
2398
  		ent.val = page_private(page);
9fb4b7cc0   Bob Liu   page_cgroup: add ...
2399
  		id = lookup_swap_cgroup_id(ent);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2400
  		rcu_read_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
2401
2402
2403
  		memcg = mem_cgroup_lookup(id);
  		if (memcg && !css_tryget(&memcg->css))
  			memcg = NULL;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2404
  		rcu_read_unlock();
3c776e646   Daisuke Nishimura   memcg: charge swa...
2405
  	}
c0bd3f63c   Daisuke Nishimura   memcg: fix try_ge...
2406
  	unlock_page_cgroup(pc);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2407
  	return memcg;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2408
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
2409
  static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
5564e88ba   Johannes Weiner   memcg: condense p...
2410
  				       struct page *page,
7ec99d621   Johannes Weiner   memcg: unify char...
2411
  				       unsigned int nr_pages,
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2412
2413
  				       enum charge_type ctype,
  				       bool lrucare)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2414
  {
ce587e65e   Johannes Weiner   mm: memcg: move p...
2415
  	struct page_cgroup *pc = lookup_page_cgroup(page);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2416
  	struct zone *uninitialized_var(zone);
fa9add641   Hugh Dickins   mm/memcg: apply a...
2417
  	struct lruvec *lruvec;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2418
  	bool was_on_lru = false;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2419
  	bool anon;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2420

ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2421
  	lock_page_cgroup(pc);
90deb7883   Johannes Weiner   mm: memcg: only c...
2422
  	VM_BUG_ON(PageCgroupUsed(pc));
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2423
2424
2425
2426
  	/*
  	 * we don't need page_cgroup_lock for tail pages, because they are not
  	 * accessed by any other context at this point.
  	 */
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2427
2428
2429
2430
2431
2432
2433
2434
2435
  
  	/*
  	 * In some cases, e.g. SwapCache and FUSE (splice_buf->radixtree), the page
  	 * may already be on some other mem_cgroup's LRU.  Take care of it.
  	 */
  	if (lrucare) {
  		zone = page_zone(page);
  		spin_lock_irq(&zone->lru_lock);
  		if (PageLRU(page)) {
fa9add641   Hugh Dickins   mm/memcg: apply a...
2436
  			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2437
  			ClearPageLRU(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
2438
  			del_page_from_lru_list(page, lruvec, page_lru(page));
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2439
2440
2441
  			was_on_lru = true;
  		}
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2442
  	pc->mem_cgroup = memcg;
261fb61a8   KAMEZAWA Hiroyuki   memcg: add commen...
2443
2444
2445
2446
2447
2448
2449
  	/*
  	 * We access a page_cgroup asynchronously without lock_page_cgroup().
  	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
  	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
  	 * before USED bit, we need memory barrier here.
  	 * See mem_cgroup_add_lru_list(), etc.
   	 */
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2450
  	smp_wmb();
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2451
  	SetPageCgroupUsed(pc);
3be91277e   Hugh Dickins   memcgroup: tidy u...
2452

9ce70c024   Hugh Dickins   memcg: fix deadlo...
2453
2454
  	if (lrucare) {
  		if (was_on_lru) {
fa9add641   Hugh Dickins   mm/memcg: apply a...
2455
  			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2456
2457
  			VM_BUG_ON(PageLRU(page));
  			SetPageLRU(page);
fa9add641   Hugh Dickins   mm/memcg: apply a...
2458
  			add_page_to_lru_list(page, lruvec, page_lru(page));
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2459
2460
2461
  		}
  		spin_unlock_irq(&zone->lru_lock);
  	}
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
2462
  	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2463
2464
2465
2466
2467
  		anon = true;
  	else
  		anon = false;
  
  	mem_cgroup_charge_statistics(memcg, anon, nr_pages);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2468
  	unlock_page_cgroup(pc);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2469

430e48631   KAMEZAWA Hiroyuki   memcg: update thr...
2470
2471
2472
2473
2474
  	/*
  	 * "charge_statistics" updated event counter. Then, check it.
  	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
  	 * if they exceeds softlimit.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2475
  	memcg_check_events(memcg, page);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2476
  }
66e1707bc   Balbir Singh   Memory controller...
2477

ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2478
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
2479
  #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2480
2481
  /*
   * Because tail pages are not marked as "used", set them. We're under
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2482
2483
2484
   * zone->lru_lock, 'splitting on pmd' and compound_lock.
   * charge/uncharge will never happen and move_account() is done under
   * compound_lock(), so we don't have to take care of races.
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2485
   */
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2486
  void mem_cgroup_split_huge_fixup(struct page *head)
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2487
2488
  {
  	struct page_cgroup *head_pc = lookup_page_cgroup(head);
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2489
2490
  	struct page_cgroup *pc;
  	int i;
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2491

3d37c4a91   KAMEZAWA Hiroyuki   memcg: bugfix che...
2492
2493
  	if (mem_cgroup_disabled())
  		return;
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2494
2495
2496
2497
  	for (i = 1; i < HPAGE_PMD_NR; i++) {
  		pc = head_pc + i;
  		pc->mem_cgroup = head_pc->mem_cgroup;
  		smp_wmb();/* see __commit_charge() */
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2498
2499
  		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
  	}
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2500
  }
12d271078   Hugh Dickins   memcg: fix split_...
2501
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2502

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2503
  /**
de3638d9c   Johannes Weiner   memcg: fold __mem...
2504
   * mem_cgroup_move_account - move account of the page
5564e88ba   Johannes Weiner   memcg: condense p...
2505
   * @page: the page
7ec99d621   Johannes Weiner   memcg: unify char...
2506
   * @nr_pages: number of regular pages (>1 for huge pages)
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2507
2508
2509
2510
2511
   * @pc:	page_cgroup of the page.
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
   *
   * The caller must confirm following.
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2512
   * - page is not on LRU (isolate_page() is useful.)
7ec99d621   Johannes Weiner   memcg: unify char...
2513
   * - compound_lock is held when nr_pages > 1
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2514
   *
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
2515
2516
   * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
   * from old cgroup.
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2517
   */
7ec99d621   Johannes Weiner   memcg: unify char...
2518
2519
2520
2521
  static int mem_cgroup_move_account(struct page *page,
  				   unsigned int nr_pages,
  				   struct page_cgroup *pc,
  				   struct mem_cgroup *from,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
2522
  				   struct mem_cgroup *to)
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2523
  {
de3638d9c   Johannes Weiner   memcg: fold __mem...
2524
2525
  	unsigned long flags;
  	int ret;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2526
  	bool anon = PageAnon(page);
987eba66e   KAMEZAWA Hiroyuki   memcg: fix rmdir,...
2527

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2528
  	VM_BUG_ON(from == to);
5564e88ba   Johannes Weiner   memcg: condense p...
2529
  	VM_BUG_ON(PageLRU(page));
de3638d9c   Johannes Weiner   memcg: fold __mem...
2530
2531
2532
2533
2534
2535
2536
  	/*
  	 * The page is isolated from the LRU. So, the collapse function
  	 * will not handle this page. But page splitting can happen.
  	 * Do this check under compound_page_lock(). The caller should
  	 * hold it.
  	 */
  	ret = -EBUSY;
7ec99d621   Johannes Weiner   memcg: unify char...
2537
  	if (nr_pages > 1 && !PageTransHuge(page))
de3638d9c   Johannes Weiner   memcg: fold __mem...
2538
2539
2540
2541
2542
2543
2544
  		goto out;
  
  	lock_page_cgroup(pc);
  
  	ret = -EINVAL;
  	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
  		goto unlock;
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
2545
  	move_lock_mem_cgroup(from, &flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2546

2ff76f119   KAMEZAWA Hiroyuki   memcg: remove PCG...
2547
  	if (!anon && page_mapped(page)) {
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
2548
2549
2550
2551
2552
  		/* Update mapped_file data for mem_cgroup */
  		preempt_disable();
  		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
  		preempt_enable();
d69b042f3   Balbir Singh   memcg: add file-b...
2553
  	}
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2554
  	mem_cgroup_charge_statistics(from, anon, -nr_pages);
d69b042f3   Balbir Singh   memcg: add file-b...
2555

854ffa8d1   Daisuke Nishimura   memcg: improve pe...
2556
  	/* caller should have done css_get */
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2557
  	pc->mem_cgroup = to;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2558
  	mem_cgroup_charge_statistics(to, anon, nr_pages);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
2559
2560
2561
  	/*
  	 * We charge against "to", which may not have any tasks. Then, "to"
  	 * can be under rmdir(). But in the current implementation, the caller of
4ffef5fef   Daisuke Nishimura   memcg: move charg...
2562
  	 * this function is just force_empty() and move charge, so it's
25985edce   Lucas De Marchi   Fix common misspe...
2563
  	 * guaranteed that "to" is never removed. So, we don't check rmdir
4ffef5fef   Daisuke Nishimura   memcg: move charg...
2564
  	 * status here.
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
2565
  	 */
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
2566
  	move_unlock_mem_cgroup(from, &flags);
de3638d9c   Johannes Weiner   memcg: fold __mem...
2567
2568
  	ret = 0;
  unlock:
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
2569
  	unlock_page_cgroup(pc);
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
2570
2571
2572
  	/*
  	 * check events
  	 */
5564e88ba   Johannes Weiner   memcg: condense p...
2573
2574
  	memcg_check_events(to, page);
  	memcg_check_events(from, page);
de3638d9c   Johannes Weiner   memcg: fold __mem...
2575
  out:
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2576
2577
2578
2579
2580
2581
  	return ret;
  }
  
  /*
   * move charges to its parent.
   */
5564e88ba   Johannes Weiner   memcg: condense p...
2582
2583
  static int mem_cgroup_move_parent(struct page *page,
  				  struct page_cgroup *pc,
6068bf010   KAMEZAWA Hiroyuki   memcg: mem_cgroup...
2584
  				  struct mem_cgroup *child)
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2585
  {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2586
  	struct mem_cgroup *parent;
7ec99d621   Johannes Weiner   memcg: unify char...
2587
  	unsigned int nr_pages;
4be4489fe   Andrew Morton   mm/memcontrol.c: ...
2588
  	unsigned long uninitialized_var(flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2589
2590
2591
  	int ret;
  
  	/* Is ROOT ? */
cc926f784   KAMEZAWA Hiroyuki   memcg: move charg...
2592
  	if (mem_cgroup_is_root(child))
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2593
  		return -EINVAL;
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
2594
2595
2596
2597
2598
  	ret = -EBUSY;
  	if (!get_page_unless_zero(page))
  		goto out;
  	if (isolate_lru_page(page))
  		goto put;
52dbb9050   KAMEZAWA Hiroyuki   memcg: fix race a...
2599

7ec99d621   Johannes Weiner   memcg: unify char...
2600
  	nr_pages = hpage_nr_pages(page);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2601

cc926f784   KAMEZAWA Hiroyuki   memcg: move charg...
2602
2603
2604
2605
2606
2607
  	parent = parent_mem_cgroup(child);
  	/*
  	 * If no parent, move charges to root cgroup.
  	 */
  	if (!parent)
  		parent = root_mem_cgroup;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2608

7ec99d621   Johannes Weiner   memcg: unify char...
2609
  	if (nr_pages > 1)
987eba66e   KAMEZAWA Hiroyuki   memcg: fix rmdir,...
2610
  		flags = compound_lock_irqsave(page);
cc926f784   KAMEZAWA Hiroyuki   memcg: move charg...
2611
  	ret = mem_cgroup_move_account(page, nr_pages,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
2612
  				pc, child, parent);
cc926f784   KAMEZAWA Hiroyuki   memcg: move charg...
2613
2614
  	if (!ret)
  		__mem_cgroup_cancel_local_charge(child, nr_pages);
8dba474f0   Jesper Juhl   mm/memcontrol.c: ...
2615

7ec99d621   Johannes Weiner   memcg: unify char...
2616
  	if (nr_pages > 1)
987eba66e   KAMEZAWA Hiroyuki   memcg: fix rmdir,...
2617
  		compound_unlock_irqrestore(page, flags);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
2618
  	putback_lru_page(page);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
2619
  put:
40d58138f   Daisuke Nishimura   memcg: fix error ...
2620
  	put_page(page);
57f9fd7d2   Daisuke Nishimura   memcg: cleanup me...
2621
  out:
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2622
2623
  	return ret;
  }
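/*
 * [Editor's note -- illustrative sketch, not part of memcontrol.c]
 * mem_cgroup_move_parent() pairs mem_cgroup_move_account() with
 * __mem_cgroup_cancel_local_charge(): the page's accounting moves to the
 * parent, and only the child's *local* charge is dropped, because with
 * hierarchical res_counters the parent's usage already covers the page.
 * The toy two-level model below shows that bookkeeping; the toy_* names are
 * invented for illustration.
 */
struct toy_counter {
	long usage;			/* pages charged at this level */
	struct toy_counter *parent;	/* hierarchical parent, may be NULL */
};

/* Charging propagates up the hierarchy, like res_counter_charge(). */
static void toy_charge(struct toy_counter *c, long pages)
{
	for (; c; c = c->parent)
		c->usage += pages;
}

/* Uncharge up to, but excluding, @stop, like res_counter_uncharge_until(). */
static void toy_uncharge_until(struct toy_counter *c, struct toy_counter *stop,
			       long pages)
{
	for (; c && c != stop; c = c->parent)
		c->usage -= pages;
}

/* Move @pages worth of accounting from @child to its parent. */
static void toy_move_to_parent(struct toy_counter *child, long pages)
{
	/*
	 * The parent's usage already includes these pages, so only the
	 * child's local share is released.
	 */
	toy_uncharge_until(child, child->parent, pages);
}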
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2624
2625
2626
2627
2628
2629
2630
  /*
   * Charge the memory controller for page usage.
   * Return
   * 0 if the charge was successful
   * < 0 if the cgroup is over its limit
   */
  static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
73045c47b   Daisuke Nishimura   memcg: remove mem...
2631
  				gfp_t gfp_mask, enum charge_type ctype)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2632
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2633
  	struct mem_cgroup *memcg = NULL;
7ec99d621   Johannes Weiner   memcg: unify char...
2634
  	unsigned int nr_pages = 1;
8493ae439   Johannes Weiner   memcg: never OOM ...
2635
  	bool oom = true;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2636
  	int ret;
ec1685109   Andrea Arcangeli   thp: memcg compound
2637

37c2ac787   Andrea Arcangeli   thp: compound_tra...
2638
  	if (PageTransHuge(page)) {
7ec99d621   Johannes Weiner   memcg: unify char...
2639
  		nr_pages <<= compound_order(page);
37c2ac787   Andrea Arcangeli   thp: compound_tra...
2640
  		VM_BUG_ON(!PageTransHuge(page));
8493ae439   Johannes Weiner   memcg: never OOM ...
2641
2642
2643
2644
2645
  		/*
  		 * Never OOM-kill a process for a huge page.  The
  		 * fault handler will fall back to regular pages.
  		 */
  		oom = false;
37c2ac787   Andrea Arcangeli   thp: compound_tra...
2646
  	}
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2647

c0ff4b854   Raghavendra K T   memcg: rename mem...
2648
  	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2649
  	if (ret == -ENOMEM)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2650
  		return ret;
ce587e65e   Johannes Weiner   mm: memcg: move p...
2651
  	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
8a9f3ccd2   Balbir Singh   Memory controller...
2652
  	return 0;
8a9f3ccd2   Balbir Singh   Memory controller...
2653
  }
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2654
2655
  int mem_cgroup_newpage_charge(struct page *page,
  			      struct mm_struct *mm, gfp_t gfp_mask)
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
2656
  {
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2657
  	if (mem_cgroup_disabled())
cede86acd   Li Zefan   memcg: clean up c...
2658
  		return 0;
7a0524cfc   Johannes Weiner   mm: memcg: remove...
2659
2660
2661
  	VM_BUG_ON(page_mapped(page));
  	VM_BUG_ON(page->mapping && !PageAnon(page));
  	VM_BUG_ON(!mm);
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
2662
  	return mem_cgroup_charge_common(page, mm, gfp_mask,
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
2663
  					MEM_CGROUP_CHARGE_TYPE_ANON);
217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
2664
  }
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2665
2666
2667
  /*
   * During swap-in (try_charge -> commit or cancel), the page is locked.
   * And when try_charge() successfully returns, one refcnt to memcg without
21ae2956c   Uwe Kleine-König   tree-wide: fix ty...
2668
   * struct page_cgroup is acquired. This refcnt will be consumed by
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2669
2670
   * "commit()" or removed by "cancel()"
   */
0435a2fdc   Johannes Weiner   mm: memcg: split ...
2671
2672
2673
2674
  static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
  					  struct page *page,
  					  gfp_t mask,
  					  struct mem_cgroup **memcgp)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2675
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2676
  	struct mem_cgroup *memcg;
90deb7883   Johannes Weiner   mm: memcg: only c...
2677
  	struct page_cgroup *pc;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2678
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2679

90deb7883   Johannes Weiner   mm: memcg: only c...
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
  	pc = lookup_page_cgroup(page);
  	/*
  	 * Every swap fault against a single page tries to charge the
  	 * page, bail as early as possible.  shmem_unuse() encounters
  	 * already charged pages, too.  The USED bit is protected by
  	 * the page lock, which serializes swap cache removal, which
  	 * in turn serializes uncharging.
  	 */
  	if (PageCgroupUsed(pc))
  		return 0;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2690
2691
  	if (!do_swap_account)
  		goto charge_cur_mm;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2692
2693
  	memcg = try_get_mem_cgroup_from_page(page);
  	if (!memcg)
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2694
  		goto charge_cur_mm;
72835c86c   Johannes Weiner   mm: unify remaini...
2695
2696
  	*memcgp = memcg;
  	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2697
  	css_put(&memcg->css);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2698
2699
  	if (ret == -EINTR)
  		ret = 0;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
2700
  	return ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2701
  charge_cur_mm:
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
2702
2703
2704
2705
  	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
  	if (ret == -EINTR)
  		ret = 0;
  	return ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2706
  }
0435a2fdc   Johannes Weiner   mm: memcg: split ...
2707
2708
2709
2710
2711
2712
  int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
  				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
  {
  	*memcgp = NULL;
  	if (mem_cgroup_disabled())
  		return 0;
bdf4f4d21   Johannes Weiner   mm: memcg: only c...
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
  	/*
  	 * A racing thread's fault, or swapoff, may have already
  	 * updated the pte, and even removed page from swap cache: in
  	 * those cases unuse_pte()'s pte_same() test will fail; but
  	 * there's also a KSM case which does need to charge the page.
  	 */
  	if (!PageSwapCache(page)) {
  		int ret;
  
  		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
  		if (ret == -EINTR)
  			ret = 0;
  		return ret;
  	}
0435a2fdc   Johannes Weiner   mm: memcg: split ...
2727
2728
  	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
  }
827a03d22   Johannes Weiner   mm: memcg: move s...
2729
2730
2731
2732
2733
2734
2735
2736
  void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
  {
  	if (mem_cgroup_disabled())
  		return;
  	if (!memcg)
  		return;
  	__mem_cgroup_cancel_charge(memcg, 1);
  }
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2737
  static void
72835c86c   Johannes Weiner   mm: unify remaini...
2738
  __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2739
  					enum charge_type ctype)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2740
  {
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2741
  	if (mem_cgroup_disabled())
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2742
  		return;
72835c86c   Johannes Weiner   mm: unify remaini...
2743
  	if (!memcg)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2744
  		return;
72835c86c   Johannes Weiner   mm: unify remaini...
2745
  	cgroup_exclude_rmdir(&memcg->css);
5a6475a4e   KAMEZAWA Hiroyuki   memcg: fix leak o...
2746

ce587e65e   Johannes Weiner   mm: memcg: move p...
2747
  	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2748
2749
2750
  	/*
  	 * Now the swap is in memory. This means this page may be
  	 * counted both as mem and swap....a double count.
03f3c4336   KAMEZAWA Hiroyuki   memcg: fix swap a...
2751
2752
2753
  	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
  	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
  	 * may call delete_from_swap_cache() before we reach here.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2754
  	 */
03f3c4336   KAMEZAWA Hiroyuki   memcg: fix swap a...
2755
  	if (do_swap_account && PageSwapCache(page)) {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2756
  		swp_entry_t ent = {.val = page_private(page)};
86493009d   Hugh Dickins   memcg swap: use m...
2757
  		mem_cgroup_uncharge_swap(ent);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2758
  	}
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
2759
2760
2761
2762
2763
  	/*
  	 * At swapin, we may charge against a cgroup which has no tasks.
  	 * So, rmdir()->pre_destroy() can be called while we do this charge.
  	 * In that case, we need to call pre_destroy() again. Check it here.
  	 */
72835c86c   Johannes Weiner   mm: unify remaini...
2764
  	cgroup_release_and_wakeup_rmdir(&memcg->css);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2765
  }
72835c86c   Johannes Weiner   mm: unify remaini...
2766
2767
  void mem_cgroup_commit_charge_swapin(struct page *page,
  				     struct mem_cgroup *memcg)
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2768
  {
72835c86c   Johannes Weiner   mm: unify remaini...
2769
  	__mem_cgroup_commit_charge_swapin(page, memcg,
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
2770
  					  MEM_CGROUP_CHARGE_TYPE_ANON);
83aae4c73   Daisuke Nishimura   memcg: cleanup ca...
2771
  }
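/*
 * [Editor's note -- illustrative sketch, not part of memcontrol.c]
 * The swap-in charge is a two-phase protocol: a fault handler first calls
 * mem_cgroup_try_charge_swapin() and must then either commit the charge
 * (the page was mapped) or cancel it (error path), as the comments above
 * describe.  The caller below is hypothetical; sketch_map_page() does not
 * exist and merely stands for "whatever can still fail after the charge".
 */
static bool sketch_map_page(struct page *page);	/* hypothetical helper */

static int sketch_swapin_fault(struct mm_struct *mm, struct page *page,
			       gfp_t gfp_mask)
{
	struct mem_cgroup *memcg = NULL;
	int ret;

	ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
	if (ret)
		return ret;			/* nothing to undo */

	if (sketch_map_page(page)) {		/* hypothetical failure */
		mem_cgroup_cancel_charge_swapin(memcg);
		return -ENOMEM;
	}

	mem_cgroup_commit_charge_swapin(page, memcg);
	return 0;
}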
827a03d22   Johannes Weiner   mm: memcg: move s...
2772
2773
  int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
  				gfp_t gfp_mask)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2774
  {
827a03d22   Johannes Weiner   mm: memcg: move s...
2775
2776
2777
  	struct mem_cgroup *memcg = NULL;
  	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
  	int ret;
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2778
  	if (mem_cgroup_disabled())
827a03d22   Johannes Weiner   mm: memcg: move s...
2779
2780
2781
  		return 0;
  	if (PageCompound(page))
  		return 0;
827a03d22   Johannes Weiner   mm: memcg: move s...
2782
2783
2784
  	if (!PageSwapCache(page))
  		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
  	else { /* page is swapcache/shmem */
0435a2fdc   Johannes Weiner   mm: memcg: split ...
2785
2786
  		ret = __mem_cgroup_try_charge_swapin(mm, page,
  						     gfp_mask, &memcg);
827a03d22   Johannes Weiner   mm: memcg: move s...
2787
2788
2789
2790
  		if (!ret)
  			__mem_cgroup_commit_charge_swapin(page, memcg, type);
  	}
  	return ret;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2791
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
2792
  static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
7ec99d621   Johannes Weiner   memcg: unify char...
2793
2794
  				   unsigned int nr_pages,
  				   const enum charge_type ctype)
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2795
2796
2797
  {
  	struct memcg_batch_info *batch = NULL;
  	bool uncharge_memsw = true;
7ec99d621   Johannes Weiner   memcg: unify char...
2798

569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2799
2800
2801
  	/* If swapout, usage of swap doesn't decrease */
  	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
  		uncharge_memsw = false;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2802
2803
2804
2805
2806
2807
2808
2809
  
  	batch = &current->memcg_batch;
  	/*
  	 * Usually, we do css_get() when we remember a memcg pointer.
  	 * But in this case, we keep res->usage until the end of a series of
  	 * uncharges. Then, it's ok to ignore the memcg's refcnt.
  	 */
  	if (!batch->memcg)
c0ff4b854   Raghavendra K T   memcg: rename mem...
2810
  		batch->memcg = memcg;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2811
  	/*
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2812
  	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
25985edce   Lucas De Marchi   Fix common misspe...
2813
  	 * In those cases, all pages freed continuously can be expected to be in
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2814
2815
2816
2817
2818
2819
2820
  	 * the same cgroup and we have a chance to coalesce uncharges.
  	 * But we uncharge one by one if the task is being killed by OOM (TIF_MEMDIE)
  	 * because we want to do the uncharge as soon as possible.
  	 */
  
  	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
  		goto direct_uncharge;
7ec99d621   Johannes Weiner   memcg: unify char...
2821
  	if (nr_pages > 1)
ec1685109   Andrea Arcangeli   thp: memcg compound
2822
  		goto direct_uncharge;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2823
  	/*
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2824
2825
2826
2827
  	 * In the typical case, batch->memcg == mem. This means we can
  	 * merge a series of uncharges into a single uncharge of the res_counter.
  	 * If not, we uncharge the res_counter one by one.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2828
  	if (batch->memcg != memcg)
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2829
2830
  		goto direct_uncharge;
  	/* remember freed charge and uncharge it later */
7ffd4ca7a   Johannes Weiner   memcg: convert un...
2831
  	batch->nr_pages++;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2832
  	if (uncharge_memsw)
7ffd4ca7a   Johannes Weiner   memcg: convert un...
2833
  		batch->memsw_nr_pages++;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2834
2835
  	return;
  direct_uncharge:
c0ff4b854   Raghavendra K T   memcg: rename mem...
2836
  	res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2837
  	if (uncharge_memsw)
c0ff4b854   Raghavendra K T   memcg: rename mem...
2838
2839
2840
  		res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
  	if (unlikely(batch->memcg != memcg))
  		memcg_oom_recover(memcg);
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2841
  }
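/*
 * [Editor's note -- illustrative sketch, not part of memcontrol.c]
 * mem_cgroup_do_uncharge() defers most uncharges into current->memcg_batch
 * so a burst of frees against one memcg collapses into a single res_counter
 * update.  The standalone model below shows that accumulate-then-flush
 * shape; all toy_* names are invented for illustration.
 */
#include <stdbool.h>

static unsigned long toy_res_usage;	/* stands in for res_counter usage */

struct toy_batch {
	bool do_batch;		/* inside an uncharge_start/end section */
	long memcg;		/* stands in for batch->memcg */
	unsigned long nr_pages;	/* deferred uncharges */
};

static void toy_uncharge_page(struct toy_batch *b, long memcg)
{
	/* Batch only while batching is active and the memcg matches. */
	if (b->do_batch && (!b->memcg || b->memcg == memcg)) {
		if (!b->memcg)
			b->memcg = memcg;
		b->nr_pages++;		/* remember it, flush later */
		return;
	}
	toy_res_usage--;		/* direct, per-page uncharge */
}

static void toy_uncharge_end(struct toy_batch *b)
{
	/* One res_counter update covers the whole series of frees. */
	toy_res_usage -= b->nr_pages;
	b->nr_pages = 0;
	b->memcg = 0;
	b->do_batch = false;
}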
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2842

8697d3319   Balbir Singh   Memory controller...
2843
  /*
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2844
   * uncharge if !page_mapped(page)
8a9f3ccd2   Balbir Singh   Memory controller...
2845
   */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2846
  static struct mem_cgroup *
0030f535a   Johannes Weiner   mm: memcg: fix co...
2847
2848
  __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
  			     bool end_migration)
8a9f3ccd2   Balbir Singh   Memory controller...
2849
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2850
  	struct mem_cgroup *memcg = NULL;
7ec99d621   Johannes Weiner   memcg: unify char...
2851
2852
  	unsigned int nr_pages = 1;
  	struct page_cgroup *pc;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2853
  	bool anon;
8a9f3ccd2   Balbir Singh   Memory controller...
2854

f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
2855
  	if (mem_cgroup_disabled())
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2856
  		return NULL;
4077960e2   Balbir Singh   memory controller...
2857

0c59b89c8   Johannes Weiner   mm: memcg: push d...
2858
  	VM_BUG_ON(PageSwapCache(page));
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2859

37c2ac787   Andrea Arcangeli   thp: compound_tra...
2860
  	if (PageTransHuge(page)) {
7ec99d621   Johannes Weiner   memcg: unify char...
2861
  		nr_pages <<= compound_order(page);
37c2ac787   Andrea Arcangeli   thp: compound_tra...
2862
2863
  		VM_BUG_ON(!PageTransHuge(page));
  	}
8697d3319   Balbir Singh   Memory controller...
2864
  	/*
3c541e14b   Balbir Singh   Memory controller...
2865
  	 * Check if our page_cgroup is valid
8697d3319   Balbir Singh   Memory controller...
2866
  	 */
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2867
  	pc = lookup_page_cgroup(page);
cfa449461   Johannes Weiner   mm: memcg: lookup...
2868
  	if (unlikely(!PageCgroupUsed(pc)))
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2869
  		return NULL;
b9c565d5a   Hugh Dickins   memcg: remove cle...
2870

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2871
  	lock_page_cgroup(pc);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2872

c0ff4b854   Raghavendra K T   memcg: rename mem...
2873
  	memcg = pc->mem_cgroup;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2874

d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2875
2876
  	if (!PageCgroupUsed(pc))
  		goto unlock_out;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2877
  	anon = PageAnon(page);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2878
  	switch (ctype) {
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
2879
  	case MEM_CGROUP_CHARGE_TYPE_ANON:
2ff76f119   KAMEZAWA Hiroyuki   memcg: remove PCG...
2880
2881
2882
2883
2884
  		/*
  		 * Generally PageAnon tells if it's the anon statistics to be
  		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
  		 * used before the page has reached the stage of being marked PageAnon.
  		 */
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2885
2886
  		anon = true;
  		/* fallthrough */
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
2887
  	case MEM_CGROUP_CHARGE_TYPE_DROP:
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
2888
  		/* See mem_cgroup_prepare_migration() */
0030f535a   Johannes Weiner   mm: memcg: fix co...
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
  		if (page_mapped(page))
  			goto unlock_out;
  		/*
  		 * Pages under migration may not be uncharged.  But
  		 * end_migration() /must/ be the one uncharging the
  		 * unused post-migration page and so it has to call
  		 * here with the migration bit still set.  See the
  		 * res_counter handling below.
  		 */
  		if (!end_migration && PageCgroupMigration(pc))
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
  			goto unlock_out;
  		break;
  	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
  		if (!PageAnon(page)) {	/* Shared memory */
  			if (page->mapping && !page_is_file_cache(page))
  				goto unlock_out;
  		} else if (page_mapped(page)) /* Anon */
  				goto unlock_out;
  		break;
  	default:
  		break;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2910
  	}
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2911

b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
2912
  	mem_cgroup_charge_statistics(memcg, anon, -nr_pages);
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
2913

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2914
  	ClearPageCgroupUsed(pc);
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
2915
2916
2917
2918
2919
2920
  	/*
  	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
  	 * freed from LRU. This is safe because an uncharged page is expected
  	 * not to be reused (it is freed soon). The exception is SwapCache,
  	 * which is handled by special functions.
  	 */
b9c565d5a   Hugh Dickins   memcg: remove cle...
2921

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2922
  	unlock_page_cgroup(pc);
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2923
  	/*
c0ff4b854   Raghavendra K T   memcg: rename mem...
2924
  	 * even after unlock, we have memcg->res.usage here and this memcg
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2925
2926
  	 * will never be freed.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2927
  	memcg_check_events(memcg, page);
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2928
  	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2929
2930
  		mem_cgroup_swap_statistics(memcg, true);
  		mem_cgroup_get(memcg);
f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
2931
  	}
0030f535a   Johannes Weiner   mm: memcg: fix co...
2932
2933
2934
2935
2936
2937
  	/*
  	 * Migration does not charge the res_counter for the
  	 * replacement page, so leave it alone when phasing out the
  	 * page that is unused after the migration.
  	 */
  	if (!end_migration && !mem_cgroup_is_root(memcg))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2938
  		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
2939

c0ff4b854   Raghavendra K T   memcg: rename mem...
2940
  	return memcg;
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2941
2942
2943
  
  unlock_out:
  	unlock_page_cgroup(pc);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2944
  	return NULL;
3c541e14b   Balbir Singh   Memory controller...
2945
  }
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2946
2947
  void mem_cgroup_uncharge_page(struct page *page)
  {
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
2948
2949
2950
  	/* early check. */
  	if (page_mapped(page))
  		return;
40f23a21a   Johannes Weiner   mm: memcg: remove...
2951
  	VM_BUG_ON(page->mapping && !PageAnon(page));
0c59b89c8   Johannes Weiner   mm: memcg: push d...
2952
2953
  	if (PageSwapCache(page))
  		return;
0030f535a   Johannes Weiner   mm: memcg: fix co...
2954
  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2955
2956
2957
2958
2959
  }
  
  void mem_cgroup_uncharge_cache_page(struct page *page)
  {
  	VM_BUG_ON(page_mapped(page));
b7abea963   KAMEZAWA Hiroyuki   memcg: make page-...
2960
  	VM_BUG_ON(page->mapping);
0030f535a   Johannes Weiner   mm: memcg: fix co...
2961
  	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
2962
  }
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
  /*
   * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
   * In those cases, pages are freed continuously and we can expect they
   * belong to the same memcg. Each of these callers itself limits the number
   * of pages freed at once, so uncharge_start/end() is called properly.
   * These calls may be nested (e.g. called twice) in one context.
   */
  
  void mem_cgroup_uncharge_start(void)
  {
  	current->memcg_batch.do_batch++;
  	/* We can do nest. */
  	if (current->memcg_batch.do_batch == 1) {
  		current->memcg_batch.memcg = NULL;
7ffd4ca7a   Johannes Weiner   memcg: convert un...
2977
2978
  		current->memcg_batch.nr_pages = 0;
  		current->memcg_batch.memsw_nr_pages = 0;
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
  	}
  }
  
  void mem_cgroup_uncharge_end(void)
  {
  	struct memcg_batch_info *batch = &current->memcg_batch;
  
  	if (!batch->do_batch)
  		return;
  
  	batch->do_batch--;
  	if (batch->do_batch) /* If stacked, do nothing. */
  		return;
  
  	if (!batch->memcg)
  		return;
  	/*
  	 * This "batch->memcg" is valid without any css_get/put etc...
  	 * because we hide charges behind us.
  	 */
7ffd4ca7a   Johannes Weiner   memcg: convert un...
2999
3000
3001
3002
3003
3004
  	if (batch->nr_pages)
  		res_counter_uncharge(&batch->memcg->res,
  				     batch->nr_pages * PAGE_SIZE);
  	if (batch->memsw_nr_pages)
  		res_counter_uncharge(&batch->memcg->memsw,
  				     batch->memsw_nr_pages * PAGE_SIZE);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3005
  	memcg_oom_recover(batch->memcg);
569b846df   KAMEZAWA Hiroyuki   memcg: coalesce u...
3006
3007
3008
  	/* forget this pointer (for sanity check) */
  	batch->memcg = NULL;
  }
e767e0561   Daisuke Nishimura   memcg: fix deadlo...
3009
  #ifdef CONFIG_SWAP
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3010
  /*
e767e0561   Daisuke Nishimura   memcg: fix deadlo...
3011
   * called after __delete_from_swap_cache(); it drops the "page" account.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3012
3013
   * memcg information is recorded to swap_cgroup of "ent"
   */
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
3014
3015
  void
  mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3016
3017
  {
  	struct mem_cgroup *memcg;
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
3018
3019
3020
3021
  	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
  
  	if (!swapout) /* this was a swap cache but the swap is unused ! */
  		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
0030f535a   Johannes Weiner   mm: memcg: fix co...
3022
  	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3023

f75ca9620   KAMEZAWA Hiroyuki   memcg: avoid css_...
3024
3025
3026
3027
3028
  	/*
  	 * record memcg information,  if swapout && memcg != NULL,
  	 * mem_cgroup_get() was called in uncharge().
  	 */
  	if (do_swap_account && swapout && memcg)
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
3029
  		swap_cgroup_record(ent, css_id(&memcg->css));
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3030
  }
e767e0561   Daisuke Nishimura   memcg: fix deadlo...
3031
  #endif
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3032

c255a4580   Andrew Morton   memcg: rename con...
3033
  #ifdef CONFIG_MEMCG_SWAP
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3034
3035
3036
3037
3038
  /*
   * called from swap_entry_free(). remove record in swap_cgroup and
   * uncharge "memsw" account.
   */
  void mem_cgroup_uncharge_swap(swp_entry_t ent)
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
3039
  {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3040
  	struct mem_cgroup *memcg;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
3041
  	unsigned short id;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3042
3043
3044
  
  	if (!do_swap_account)
  		return;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
3045
3046
3047
  	id = swap_cgroup_record(ent, 0);
  	rcu_read_lock();
  	memcg = mem_cgroup_lookup(id);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3048
  	if (memcg) {
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
3049
3050
3051
3052
  		/*
  		 * We uncharge this because swap is freed.
  		 * This memcg can be an obsolete one. We avoid calling css_tryget
  		 */
0c3e73e84   Balbir Singh   memcg: improve re...
3053
  		if (!mem_cgroup_is_root(memcg))
4e649152c   KAMEZAWA Hiroyuki   memcg: some modif...
3054
  			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
0c3e73e84   Balbir Singh   memcg: improve re...
3055
  		mem_cgroup_swap_statistics(memcg, false);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3056
3057
  		mem_cgroup_put(memcg);
  	}
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
3058
  	rcu_read_unlock();
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
3059
  }
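  
  /*
   * Illustrative sketch of the swap_cgroup round trip used by the two
   * functions above (this wrapper itself is hypothetical): at swap-out the
   * owning memcg's css id is stored for the entry; when the swap entry is
   * finally freed, the id is read back (and cleared by recording 0) to find
   * the memcg whose "memsw" counter must be uncharged.
   */
  static void example_swap_record_roundtrip(swp_entry_t ent, struct mem_cgroup *memcg)
  {
  	unsigned short id;
  	struct mem_cgroup *owner;
  
  	/* at swap-out (see mem_cgroup_uncharge_swapcache() above) */
  	swap_cgroup_record(ent, css_id(&memcg->css));
  
  	/* at swap_entry_free() (see mem_cgroup_uncharge_swap() above) */
  	id = swap_cgroup_record(ent, 0);
  	rcu_read_lock();
  	owner = mem_cgroup_lookup(id);		/* may be NULL or an obsolete memcg */
  	if (owner && !mem_cgroup_is_root(owner))
  		res_counter_uncharge(&owner->memsw, PAGE_SIZE);
  	rcu_read_unlock();
  }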
024914477   Daisuke Nishimura   memcg: move charg...
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
   * The caller must have charged to @to, IOW, called res_counter_charge() for
   * both res and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
3076
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
3077
3078
3079
3080
3081
3082
3083
  {
  	unsigned short old_id, new_id;
  
  	old_id = css_id(&from->css);
  	new_id = css_id(&to->css);
  
  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
024914477   Daisuke Nishimura   memcg: move charg...
3084
  		mem_cgroup_swap_statistics(from, false);
483c30b51   Daisuke Nishimura   memcg: improve pe...
3085
  		mem_cgroup_swap_statistics(to, true);
024914477   Daisuke Nishimura   memcg: move charg...
3086
  		/*
483c30b51   Daisuke Nishimura   memcg: improve pe...
3087
3088
3089
3090
3091
3092
  		 * This function is only called from task migration context now.
  		 * It postpones res_counter and refcount handling till the end
  		 * of task migration (mem_cgroup_clear_mc()) for performance
  		 * improvement. But we cannot postpone mem_cgroup_get(to)
  		 * because if the process that has been moved to @to does
  		 * swap-in, the refcount of @to might be decreased to 0.
024914477   Daisuke Nishimura   memcg: move charg...
3093
  		 */
024914477   Daisuke Nishimura   memcg: move charg...
3094
  		mem_cgroup_get(to);
024914477   Daisuke Nishimura   memcg: move charg...
3095
3096
3097
3098
3099
3100
  		return 0;
  	}
  	return -EINVAL;
  }
  #else
  static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
3101
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
3102
3103
3104
  {
  	return -EINVAL;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3105
  #endif
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
3106

ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3107
  /*
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3108
3109
   * Before starting migration, account PAGE_SIZE to the mem_cgroup that the old
   * page belongs to.
ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3110
   */
0030f535a   Johannes Weiner   mm: memcg: fix co...
3111
3112
  void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
  				  struct mem_cgroup **memcgp)
ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3113
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
3114
  	struct mem_cgroup *memcg = NULL;
7ec99d621   Johannes Weiner   memcg: unify char...
3115
  	struct page_cgroup *pc;
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3116
  	enum charge_type ctype;
8869b8f6e   Hugh Dickins   memcg: memcontrol...
3117

72835c86c   Johannes Weiner   mm: unify remaini...
3118
  	*memcgp = NULL;
56039efa1   KAMEZAWA Hiroyuki   memcg: fix ugly i...
3119

ec1685109   Andrea Arcangeli   thp: memcg compound
3120
  	VM_BUG_ON(PageTransHuge(page));
f8d665422   Hirokazu Takahashi   memcg: add mem_cg...
3121
  	if (mem_cgroup_disabled())
0030f535a   Johannes Weiner   mm: memcg: fix co...
3122
  		return;
4077960e2   Balbir Singh   memory controller...
3123

52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3124
3125
3126
  	pc = lookup_page_cgroup(page);
  	lock_page_cgroup(pc);
  	if (PageCgroupUsed(pc)) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
3127
3128
  		memcg = pc->mem_cgroup;
  		css_get(&memcg->css);
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
  		/*
  		 * When migrating an anonymous page, its mapcount goes down
  		 * to 0 and uncharge() will be called. But, even if it's fully
  		 * unmapped, migration may fail and this page has to be
  		 * charged again. We set MIGRATION flag here and delay uncharge
  		 * until end_migration() is called
  		 *
  		 * Corner Case Thinking
  		 * A)
  		 * When the old page was mapped as Anon and it's unmap-and-freed
  		 * while migration was ongoing.
  		 * If unmap finds the old page, uncharge() of it will be delayed
  		 * until end_migration(). If unmap finds a new page, it's
  		 * uncharged when its mapcount goes from 1 to 0. If the unmap code
  		 * finds a swap_migration_entry, the new page will not be mapped
  		 * and end_migration() will find it (mapcount == 0).
  		 *
  		 * B)
  		 * When the old page was mapped but migration fails, the kernel
  		 * remaps it. A charge for it is kept by MIGRATION flag even
  		 * if mapcount goes down to 0. We can do remap successfully
  		 * without charging it again.
  		 *
  		 * C)
  		 * The "old" page is under lock_page() until the end of
  		 * migration, so, the old page itself will not be swapped-out.
  		 * If the new page is swapped out before end_migration, our
  		 * hook to usual swap-out path will catch the event.
  		 */
  		if (PageAnon(page))
  			SetPageCgroupMigration(pc);
e8589cc18   KAMEZAWA Hiroyuki   memcg: better mig...
3160
  	}
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3161
  	unlock_page_cgroup(pc);
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3162
3163
3164
3165
  	/*
  	 * If the page is not charged at this point,
  	 * we return here.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3166
  	if (!memcg)
0030f535a   Johannes Weiner   mm: memcg: fix co...
3167
  		return;
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3168

72835c86c   Johannes Weiner   mm: unify remaini...
3169
  	*memcgp = memcg;
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3170
3171
3172
3173
3174
3175
  	/*
  	 * We charge the new page before it's used/mapped. So, even if unlock_page()
  	 * is called before end_migration, we can catch all events on this new
  	 * page. In case the new page is migrated but not remapped, the new page's
  	 * mapcount will finally be 0 and we call uncharge in end_migration().
  	 */
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3176
  	if (PageAnon(page))
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
3177
  		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3178
  	else
62ba7442c   Johannes Weiner   mm: memcg: remove...
3179
  		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
0030f535a   Johannes Weiner   mm: memcg: fix co...
3180
3181
3182
3183
3184
  	/*
  	 * The page is committed to the memcg, but it's not actually
  	 * charged to the res_counter since we plan on replacing the
  	 * old one and only one page is going to be left afterwards.
  	 */
ce587e65e   Johannes Weiner   mm: memcg: move p...
3185
  	__mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3186
  }
8869b8f6e   Hugh Dickins   memcg: memcontrol...
3187

69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
3188
  /* remove redundant charge if migration failed*/
c0ff4b854   Raghavendra K T   memcg: rename mem...
3189
  void mem_cgroup_end_migration(struct mem_cgroup *memcg,
50de1dd96   Daisuke Nishimura   memcg: fix memory...
3190
  	struct page *oldpage, struct page *newpage, bool migration_ok)
ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3191
  {
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3192
  	struct page *used, *unused;
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3193
  	struct page_cgroup *pc;
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
3194
  	bool anon;
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3195

c0ff4b854   Raghavendra K T   memcg: rename mem...
3196
  	if (!memcg)
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3197
  		return;
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3198
  	/* blocks rmdir() */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3199
  	cgroup_exclude_rmdir(&memcg->css);
50de1dd96   Daisuke Nishimura   memcg: fix memory...
3200
  	if (!migration_ok) {
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3201
3202
  		used = oldpage;
  		unused = newpage;
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3203
  	} else {
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3204
  		used = newpage;
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3205
3206
  		unused = oldpage;
  	}
0030f535a   Johannes Weiner   mm: memcg: fix co...
3207
  	anon = PageAnon(used);
7d188958b   Johannes Weiner   mm: memcg: only c...
3208
3209
3210
3211
  	__mem_cgroup_uncharge_common(unused,
  				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
  				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
  				     true);
0030f535a   Johannes Weiner   mm: memcg: fix co...
3212
  	css_put(&memcg->css);
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
3213
  	/*
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3214
3215
3216
  	 * We disallowed uncharge of pages under migration because the mapcount
  	 * of the page goes down to zero, temporarily.
  	 * Clear the flag and check whether the page should be charged.
01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3217
  	 */
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3218
3219
3220
3221
  	pc = lookup_page_cgroup(oldpage);
  	lock_page_cgroup(pc);
  	ClearPageCgroupMigration(pc);
  	unlock_page_cgroup(pc);
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3222

01b1ae63c   KAMEZAWA Hiroyuki   memcg: simple mig...
3223
  	/*
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3224
3225
3226
3227
3228
3229
  	 * If a page is a file cache, the radix-tree replacement is atomic
  	 * and we can skip this check. When it was an Anon page, its mapcount
  	 * goes down to 0. But because we added the MIGRATION flag, it's not
  	 * uncharged yet. There are several cases, but the page->mapcount check
  	 * and the USED bit check in mem_cgroup_uncharge_page() will do enough
  	 * checking. (see prepare_charge() also)
69029cd55   KAMEZAWA Hiroyuki   memcg: remove ref...
3230
  	 */
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
3231
  	if (anon)
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3232
  		mem_cgroup_uncharge_page(used);
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3233
  	/*
ac39cf8cb   akpm@linux-foundation.org   memcg: fix mis-ac...
3234
3235
  	 * At migration, we may charge against a cgroup which has no
  	 * tasks.
887032670   KAMEZAWA Hiroyuki   cgroup avoid perm...
3236
3237
3238
  	 * So, rmdir()->pre_destroy() can be called while we do this charge.
  	 * In that case, we need to call pre_destroy() again. check it here.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3239
  	cgroup_release_and_wakeup_rmdir(&memcg->css);
ae41be374   KAMEZAWA Hiroyuki   bugfix for memory...
3240
  }
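  
  /*
   * Illustrative sketch (simplified; the real caller is the migration core,
   * not this file): the charge on the old page is carried across the copy by
   * the prepare/end pair above.  Error handling and the page copy are
   * omitted, and "migration_ok" is simply assumed here.
   */
  static void example_migrate_charge(struct page *page, struct page *newpage)
  {
  	struct mem_cgroup *memcg;	/* set to NULL if the page is not charged */
  	bool migration_ok = true;	/* pretend the copy succeeded */
  
  	mem_cgroup_prepare_migration(page, newpage, &memcg);
  
  	/* ... copy page contents and remap users here ... */
  
  	/* uncharges whichever of the two pages ended up unused */
  	mem_cgroup_end_migration(memcg, page, newpage, migration_ok);
  }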
78fb74669   Pavel Emelianov   Memory controller...
3241

ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3242
3243
3244
3245
3246
3247
3248
3249
  /*
   * When replacing page cache, newpage is not under any memcg but it's on the
   * LRU. So, this function doesn't touch the res_counter but handles the LRU
   * in the correct way. Both pages are locked so we cannot race with uncharge.
   */
  void mem_cgroup_replace_page_cache(struct page *oldpage,
  				  struct page *newpage)
  {
bde05d1cc   Hugh Dickins   shmem: replace pa...
3250
  	struct mem_cgroup *memcg = NULL;
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3251
  	struct page_cgroup *pc;
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3252
  	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3253
3254
3255
3256
3257
3258
3259
  
  	if (mem_cgroup_disabled())
  		return;
  
  	pc = lookup_page_cgroup(oldpage);
  	/* fix accounting on old pages */
  	lock_page_cgroup(pc);
bde05d1cc   Hugh Dickins   shmem: replace pa...
3260
3261
3262
3263
3264
  	if (PageCgroupUsed(pc)) {
  		memcg = pc->mem_cgroup;
  		mem_cgroup_charge_statistics(memcg, false, -1);
  		ClearPageCgroupUsed(pc);
  	}
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3265
  	unlock_page_cgroup(pc);
bde05d1cc   Hugh Dickins   shmem: replace pa...
3266
3267
3268
3269
3270
3271
  	/*
  	 * When called from shmem_replace_page(), in some cases the
  	 * oldpage has already been charged, and in some cases not.
  	 */
  	if (!memcg)
  		return;
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3272
3273
3274
3275
3276
  	/*
  	 * Even if newpage->mapping was NULL before starting replacement,
  	 * the newpage may be on the LRU (or a pagevec for the LRU) already. We
  	 * lock the LRU while we overwrite pc->mem_cgroup.
  	 */
ce587e65e   Johannes Weiner   mm: memcg: move p...
3277
  	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
ab936cbcd   KAMEZAWA Hiroyuki   memcg: add mem_cg...
3278
  }
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
3279
3280
3281
3282
3283
3284
  #ifdef CONFIG_DEBUG_VM
  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
  {
  	struct page_cgroup *pc;
  
  	pc = lookup_page_cgroup(page);
cfa449461   Johannes Weiner   mm: memcg: lookup...
3285
3286
3287
3288
3289
  	/*
  	 * Can be NULL while feeding pages into the page allocator for
  	 * the first time, i.e. during boot or memory hotplug;
  	 * or when mem_cgroup_disabled().
  	 */
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
  	if (likely(pc) && PageCgroupUsed(pc))
  		return pc;
  	return NULL;
  }
  
  bool mem_cgroup_bad_page_check(struct page *page)
  {
  	if (mem_cgroup_disabled())
  		return false;
  
  	return lookup_page_cgroup_used(page) != NULL;
  }
  
  void mem_cgroup_print_bad_page(struct page *page)
  {
  	struct page_cgroup *pc;
  
  	pc = lookup_page_cgroup_used(page);
  	if (pc) {
90b3feaec   Hugh Dickins   memcg: fix mem_cg...
3309
3310
  		printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p
  ",
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
3311
  		       pc, pc->flags, pc->mem_cgroup);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
3312
3313
3314
  	}
  }
  #endif
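  
  /*
   * Illustrative sketch (the caller is hypothetical, not quoted from
   * mm/page_alloc.c): a page-allocator debug check can use the two helpers
   * above to flag pages that are freed while still charged to a memcg.
   */
  static void example_debug_check_on_free(struct page *page)
  {
  	if (mem_cgroup_bad_page_check(page)) {
  		pr_alert("page %p freed while still charged to a memcg\n", page);
  		mem_cgroup_print_bad_page(page);
  	}
  }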
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3315
  static DEFINE_MUTEX(set_limit_mutex);
d38d2a758   KOSAKI Motohiro   mm: make mem_cgro...
3316
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3317
  				unsigned long long val)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3318
  {
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3319
  	int retry_count;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3320
  	u64 memswlimit, memlimit;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3321
  	int ret = 0;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3322
3323
  	int children = mem_cgroup_count_children(memcg);
  	u64 curusage, oldusage;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3324
  	int enlarge;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3325
3326
3327
3328
3329
3330
3331
3332
3333
  
  	/*
  	 * For keeping hierarchical_reclaim simple, how long we should retry
  	 * depends on the caller. We set our retry count to be a function
  	 * of the number of children which we should visit in this loop.
  	 */
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
  
  	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3334

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3335
  	enlarge = 0;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3336
  	while (retry_count) {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3337
3338
3339
3340
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3341
3342
3343
  		/*
  		 * Rather than hiding it all in some function, do this in an
  		 * open-coded manner so you can see what it really does.
aaad153e3   Wanpeng Li   mm/memcg: mem_cgr...
3344
  		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3345
3346
3347
3348
3349
3350
  		 */
  		mutex_lock(&set_limit_mutex);
  		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		if (memswlimit < val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3351
3352
  			break;
  		}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3353
3354
3355
3356
  
  		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		if (memlimit < val)
  			enlarge = 1;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3357
  		ret = res_counter_set_limit(&memcg->res, val);
22a668d7c   KAMEZAWA Hiroyuki   memcg: fix behavi...
3358
3359
3360
3361
3362
3363
  		if (!ret) {
  			if (memswlimit == val)
  				memcg->memsw_is_minimum = true;
  			else
  				memcg->memsw_is_minimum = false;
  		}
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3364
3365
3366
3367
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
5660048cc   Johannes Weiner   mm: move memcg hi...
3368
3369
  		mem_cgroup_reclaim(memcg, GFP_KERNEL,
  				   MEM_CGROUP_RECLAIM_SHRINK);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3370
3371
3372
3373
3374
3375
  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
  		/* Usage is reduced ? */
  		if (curusage >= oldusage)
  			retry_count--;
  		else
  			oldusage = curusage;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3376
  	}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3377
3378
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
14797e236   KOSAKI Motohiro   memcg: add inacti...
3379

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3380
3381
  	return ret;
  }
338c84310   Li Zefan   memcg: remove som...
3382
3383
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
  					unsigned long long val)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3384
  {
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3385
  	int retry_count;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3386
  	u64 memlimit, memswlimit, oldusage, curusage;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3387
3388
  	int children = mem_cgroup_count_children(memcg);
  	int ret = -EBUSY;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3389
  	int enlarge = 0;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3390

81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3391
3392
3393
  	/* see mem_cgroup_resize_res_limit */
  	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
  	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3394
3395
3396
3397
3398
3399
3400
3401
  	while (retry_count) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
  		/*
  		 * Rather than hiding it all in some function, do this in an
  		 * open-coded manner so you can see what it really does.
aaad153e3   Wanpeng Li   mm/memcg: mem_cgr...
3402
  		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3403
3404
3405
3406
3407
3408
3409
3410
  		 */
  		mutex_lock(&set_limit_mutex);
  		memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		if (memlimit > val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
  			break;
  		}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3411
3412
3413
  		memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		if (memswlimit < val)
  			enlarge = 1;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3414
  		ret = res_counter_set_limit(&memcg->memsw, val);
22a668d7c   KAMEZAWA Hiroyuki   memcg: fix behavi...
3415
3416
3417
3418
3419
3420
  		if (!ret) {
  			if (memlimit == val)
  				memcg->memsw_is_minimum = true;
  			else
  				memcg->memsw_is_minimum = false;
  		}
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3421
3422
3423
3424
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
5660048cc   Johannes Weiner   mm: move memcg hi...
3425
3426
3427
  		mem_cgroup_reclaim(memcg, GFP_KERNEL,
  				   MEM_CGROUP_RECLAIM_NOSWAP |
  				   MEM_CGROUP_RECLAIM_SHRINK);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3428
  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3429
  		/* Usage is reduced ? */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3430
  		if (curusage >= oldusage)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3431
  			retry_count--;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
3432
3433
  		else
  			oldusage = curusage;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3434
  	}
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3435
3436
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3437
3438
  	return ret;
  }
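  
  /*
   * Illustrative sketch of the ordering imposed by the two resize helpers
   * above (this wrapper is hypothetical; in this file both are reached via
   * mem_cgroup_write()).  Since memcg->res.limit <= memcg->memsw.limit must
   * hold at all times, raising both limits means growing memsw first, and
   * shrinking both means shrinking res first.
   */
  static int example_raise_both_limits(struct mem_cgroup *memcg,
  				     unsigned long long val)
  {
  	int ret;
  
  	ret = mem_cgroup_resize_memsw_limit(memcg, val);	/* grow memsw first */
  	if (ret)
  		return ret;
  	return mem_cgroup_resize_limit(memcg, val);		/* then grow res */
  }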
4e4169535   Balbir Singh   memory controller...
3439
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
0ae5e89c6   Ying Han   memcg: count the ...
3440
3441
  					    gfp_t gfp_mask,
  					    unsigned long *total_scanned)
4e4169535   Balbir Singh   memory controller...
3442
3443
3444
3445
3446
3447
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
ef8745c1e   KAMEZAWA Hiroyuki   memcg: reduce che...
3448
  	unsigned long long excess;
0ae5e89c6   Ying Han   memcg: count the ...
3449
  	unsigned long nr_scanned;
4e4169535   Balbir Singh   memory controller...
3450
3451
3452
  
  	if (order > 0)
  		return 0;
00918b6ab   KOSAKI Motohiro   memcg: remove nid...
3453
  	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4e4169535   Balbir Singh   memory controller...
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
  	/*
  	 * This loop can run for a while, especially if mem_cgroups continuously
  	 * keep exceeding their soft limit and putting the system under
  	 * pressure
  	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
0ae5e89c6   Ying Han   memcg: count the ...
3466
  		nr_scanned = 0;
d79154bb5   Hugh Dickins   memcg: replace me...
3467
  		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
5660048cc   Johannes Weiner   mm: move memcg hi...
3468
  						    gfp_mask, &nr_scanned);
4e4169535   Balbir Singh   memory controller...
3469
  		nr_reclaimed += reclaimed;
0ae5e89c6   Ying Han   memcg: count the ...
3470
  		*total_scanned += nr_scanned;
4e4169535   Balbir Singh   memory controller...
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
  		spin_lock(&mctz->lock);
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
  		if (!reclaimed) {
  			do {
  				/*
  				 * Loop until we find yet another one.
  				 *
  				 * By the time we get the soft_limit lock
  				 * again, someone might have added the
  				 * group back on the RB tree. Iterate to
  				 * make sure we get a different memcg.
  				 * mem_cgroup_largest_soft_limit_node returns
  				 * NULL if no other cgroup is present on
  				 * the tree
  				 */
  				next_mz =
  				__mem_cgroup_largest_soft_limit_node(mctz);
39cc98f1f   Michal Hocko   memcg: remove poi...
3493
  				if (next_mz == mz)
d79154bb5   Hugh Dickins   memcg: replace me...
3494
  					css_put(&next_mz->memcg->css);
39cc98f1f   Michal Hocko   memcg: remove poi...
3495
  				else /* next_mz == NULL or other memcg */
4e4169535   Balbir Singh   memory controller...
3496
3497
3498
  					break;
  			} while (1);
  		}
d79154bb5   Hugh Dickins   memcg: replace me...
3499
3500
  		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
  		excess = res_counter_soft_limit_excess(&mz->memcg->res);
4e4169535   Balbir Singh   memory controller...
3501
3502
3503
3504
3505
3506
3507
3508
  		/*
  		 * One school of thought says that we should not add
  		 * back the node to the tree if reclaim returns 0.
  		 * But our reclaim could return 0 simply because, due
  		 * to priority, we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this as a longer
  		 * term TODO.
  		 */
ef8745c1e   KAMEZAWA Hiroyuki   memcg: reduce che...
3509
  		/* If excess == 0, no tree ops */
d79154bb5   Hugh Dickins   memcg: replace me...
3510
  		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4e4169535   Balbir Singh   memory controller...
3511
  		spin_unlock(&mctz->lock);
d79154bb5   Hugh Dickins   memcg: replace me...
3512
  		css_put(&mz->memcg->css);
4e4169535   Balbir Singh   memory controller...
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
d79154bb5   Hugh Dickins   memcg: replace me...
3525
  		css_put(&next_mz->memcg->css);
4e4169535   Balbir Singh   memory controller...
3526
3527
  	return nr_reclaimed;
  }
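  
  /*
   * Illustrative sketch (simplified; the real caller lives in mm/vmscan.c):
   * global reclaim asks the soft-limit tree above for the worst offenders in
   * a zone before falling back to its regular scan, and folds the number of
   * pages scanned here into its own bookkeeping.
   */
  static unsigned long example_soft_reclaim_zone(struct zone *zone, gfp_t gfp_mask)
  {
  	unsigned long nr_soft_scanned = 0;
  	unsigned long nr_soft_reclaimed;
  
  	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, 0, gfp_mask,
  							  &nr_soft_scanned);
  	/* the real caller adds nr_soft_scanned to its scan-control counters */
  	return nr_soft_reclaimed;
  }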
c9b0ed514   KAMEZAWA Hiroyuki   memcg: helper fun...
3528
  /*
3c935d189   KAMEZAWA Hiroyuki   memcg: make mem_c...
3529
3530
3531
3532
   * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
   * reclaim the pages themselves - it just removes the page_cgroups.
   * Returns true if some page_cgroups were not freed, indicating that the caller
   * must retry this operation.
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3533
   */
3c935d189   KAMEZAWA Hiroyuki   memcg: make mem_c...
3534
  static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3535
  				int node, int zid, enum lru_list lru)
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3536
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3537
  	struct mem_cgroup_per_zone *mz;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3538
  	unsigned long flags, loop;
072c56c13   KAMEZAWA Hiroyuki   per-zone and recl...
3539
  	struct list_head *list;
925b7673c   Johannes Weiner   mm: make per-memc...
3540
3541
  	struct page *busy;
  	struct zone *zone;
072c56c13   KAMEZAWA Hiroyuki   per-zone and recl...
3542

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3543
  	zone = &NODE_DATA(node)->node_zones[zid];
c0ff4b854   Raghavendra K T   memcg: rename mem...
3544
  	mz = mem_cgroup_zoneinfo(memcg, node, zid);
6290df545   Johannes Weiner   mm: collect LRU l...
3545
  	list = &mz->lruvec.lists[lru];
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3546

1eb492725   Hugh Dickins   memcg: lru_size i...
3547
  	loop = mz->lru_size[lru];
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3548
3549
3550
3551
  	/* give some margin against EBUSY etc...*/
  	loop += 256;
  	busy = NULL;
  	while (loop--) {
925b7673c   Johannes Weiner   mm: make per-memc...
3552
  		struct page_cgroup *pc;
5564e88ba   Johannes Weiner   memcg: condense p...
3553
  		struct page *page;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3554
  		spin_lock_irqsave(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3555
  		if (list_empty(list)) {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3556
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3557
  			break;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3558
  		}
925b7673c   Johannes Weiner   mm: make per-memc...
3559
3560
3561
  		page = list_entry(list->prev, struct page, lru);
  		if (busy == page) {
  			list_move(&page->lru, list);
648bcc771   Thiago Farina   mm/memcontrol.c: ...
3562
  			busy = NULL;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3563
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3564
3565
  			continue;
  		}
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3566
  		spin_unlock_irqrestore(&zone->lru_lock, flags);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3567

925b7673c   Johannes Weiner   mm: make per-memc...
3568
  		pc = lookup_page_cgroup(page);
5564e88ba   Johannes Weiner   memcg: condense p...
3569

3c935d189   KAMEZAWA Hiroyuki   memcg: make mem_c...
3570
  		if (mem_cgroup_move_parent(page, pc, memcg)) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3571
  			/* found lock contention or "pc" is obsolete. */
925b7673c   Johannes Weiner   mm: make per-memc...
3572
  			busy = page;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3573
3574
3575
  			cond_resched();
  		} else
  			busy = NULL;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3576
  	}
3c935d189   KAMEZAWA Hiroyuki   memcg: make mem_c...
3577
  	return !list_empty(list);
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3578
3579
3580
3581
3582
3583
  }
  
  /*
   * make the mem_cgroup's charge 0 if there is no task.
   * This enables deleting this mem_cgroup.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3584
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3585
  {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3586
3587
3588
  	int ret;
  	int node, zid, shrink;
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3589
  	struct cgroup *cgrp = memcg->css.cgroup;
8869b8f6e   Hugh Dickins   memcg: memcontrol...
3590

c0ff4b854   Raghavendra K T   memcg: rename mem...
3591
  	css_get(&memcg->css);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3592
3593
  
  	shrink = 0;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3594
3595
3596
  	/* should free all ? */
  	if (free_all)
  		goto try_to_free;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3597
  move_account:
fce664775   Daisuke Nishimura   memcg: ensure lis...
3598
  	do {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3599
  		ret = -EBUSY;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3600
3601
  		if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
  			goto out;
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3602
3603
  		/* This is for making all *used* pages be on the LRU. */
  		lru_add_drain_all();
c0ff4b854   Raghavendra K T   memcg: rename mem...
3604
  		drain_all_stock_sync(memcg);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3605
  		ret = 0;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3606
  		mem_cgroup_start_move(memcg);
299b4eaa3   KAMEZAWA Hiroyuki   memcg: NULL point...
3607
  		for_each_node_state(node, N_HIGH_MEMORY) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3608
  			for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
f156ab933   Hugh Dickins   memcg: enum lru_l...
3609
3610
  				enum lru_list lru;
  				for_each_lru(lru) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
3611
  					ret = mem_cgroup_force_empty_list(memcg,
f156ab933   Hugh Dickins   memcg: enum lru_l...
3612
  							node, zid, lru);
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3613
3614
3615
  					if (ret)
  						break;
  				}
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3616
  			}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3617
3618
3619
  			if (ret)
  				break;
  		}
c0ff4b854   Raghavendra K T   memcg: rename mem...
3620
3621
  		mem_cgroup_end_move(memcg);
  		memcg_oom_recover(memcg);
52d4b9ac0   KAMEZAWA Hiroyuki   memcg: allocate a...
3622
  		cond_resched();
fce664775   Daisuke Nishimura   memcg: ensure lis...
3623
  	/* "ret" should also be checked to ensure all lists are empty. */
569530fb1   Glauber Costa   memcg: do not ope...
3624
  	} while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3625
  out:
c0ff4b854   Raghavendra K T   memcg: rename mem...
3626
  	css_put(&memcg->css);
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3627
  	return ret;
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3628
3629
  
  try_to_free:
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3630
3631
  	/* returns EBUSY if there is a task or if we come here twice. */
  	if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3632
3633
3634
  		ret = -EBUSY;
  		goto out;
  	}
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3635
3636
  	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3637
3638
  	/* try to free all pages in this cgroup */
  	shrink = 1;
569530fb1   Glauber Costa   memcg: do not ope...
3639
  	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3640
  		int progress;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3641
3642
3643
3644
3645
  
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			goto out;
  		}
c0ff4b854   Raghavendra K T   memcg: rename mem...
3646
  		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
185efc0f9   Johannes Weiner   memcg: Revert "me...
3647
  						false);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3648
  		if (!progress) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3649
  			nr_retries--;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3650
  			/* maybe some writeback is necessary */
8aa7e847d   Jens Axboe   Fix congestion_wa...
3651
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3652
  		}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3653
3654
  
  	}
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
3655
  	lru_add_drain();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
3656
  	/* try move_account...there may be some *locked* pages. */
fce664775   Daisuke Nishimura   memcg: ensure lis...
3657
  	goto move_account;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
3658
  }
6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
3659
  static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3660
3661
3662
  {
  	return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
  }
18f59ea7d   Balbir Singh   memcg: memory cgr...
3663
3664
3665
3666
3667
3668
3669
3670
3671
  static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
  {
  	return mem_cgroup_from_cont(cont)->use_hierarchy;
  }
  
  static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
  					u64 val)
  {
  	int retval = 0;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3672
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3673
  	struct cgroup *parent = cont->parent;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3674
  	struct mem_cgroup *parent_memcg = NULL;
18f59ea7d   Balbir Singh   memcg: memory cgr...
3675
3676
  
  	if (parent)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3677
  		parent_memcg = mem_cgroup_from_cont(parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3678
3679
  
  	cgroup_lock();
567fb435b   Glauber Costa   memcg: fix bad be...
3680
3681
3682
  
  	if (memcg->use_hierarchy == val)
  		goto out;
18f59ea7d   Balbir Singh   memcg: memory cgr...
3683
  	/*
af901ca18   André Goddard Rosa   tree-wide: fix as...
3684
  	 * If parent's use_hierarchy is set, we can't make any modifications
18f59ea7d   Balbir Singh   memcg: memory cgr...
3685
3686
3687
3688
3689
3690
  	 * in the child subtrees. If it is unset, then the change can
  	 * occur, provided the current cgroup has no children.
  	 *
  	 * For the root cgroup, parent_memcg is NULL; we allow the value to be
  	 * set if there are no children.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3691
  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
18f59ea7d   Balbir Singh   memcg: memory cgr...
3692
3693
  				(val == 1 || val == 0)) {
  		if (list_empty(&cont->children))
c0ff4b854   Raghavendra K T   memcg: rename mem...
3694
  			memcg->use_hierarchy = val;
18f59ea7d   Balbir Singh   memcg: memory cgr...
3695
3696
3697
3698
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
567fb435b   Glauber Costa   memcg: fix bad be...
3699
3700
  
  out:
18f59ea7d   Balbir Singh   memcg: memory cgr...
3701
3702
3703
3704
  	cgroup_unlock();
  
  	return retval;
  }
0c3e73e84   Balbir Singh   memcg: improve re...
3705

c0ff4b854   Raghavendra K T   memcg: rename mem...
3706
  static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
7a159cc9d   Johannes Weiner   memcg: use native...
3707
  					       enum mem_cgroup_stat_index idx)
0c3e73e84   Balbir Singh   memcg: improve re...
3708
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3709
  	struct mem_cgroup *iter;
7a159cc9d   Johannes Weiner   memcg: use native...
3710
  	long val = 0;
0c3e73e84   Balbir Singh   memcg: improve re...
3711

7a159cc9d   Johannes Weiner   memcg: use native...
3712
  	/* Per-cpu values can be negative, use a signed accumulator */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3713
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3714
3715
3716
3717
3718
  		val += mem_cgroup_read_stat(iter, idx);
  
  	if (val < 0) /* race ? */
  		val = 0;
  	return val;
0c3e73e84   Balbir Singh   memcg: improve re...
3719
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3720
  static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
104f39284   Kirill A. Shutemov   memcg: extract me...
3721
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3722
  	u64 val;
104f39284   Kirill A. Shutemov   memcg: extract me...
3723

c0ff4b854   Raghavendra K T   memcg: rename mem...
3724
  	if (!mem_cgroup_is_root(memcg)) {
104f39284   Kirill A. Shutemov   memcg: extract me...
3725
  		if (!swap)
65c64ce8e   Glauber Costa   Partial revert "B...
3726
  			return res_counter_read_u64(&memcg->res, RES_USAGE);
104f39284   Kirill A. Shutemov   memcg: extract me...
3727
  		else
65c64ce8e   Glauber Costa   Partial revert "B...
3728
  			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
104f39284   Kirill A. Shutemov   memcg: extract me...
3729
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
3730
3731
  	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
  	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
104f39284   Kirill A. Shutemov   memcg: extract me...
3732

7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3733
  	if (swap)
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
3734
  		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
104f39284   Kirill A. Shutemov   memcg: extract me...
3735
3736
3737
  
  	return val << PAGE_SHIFT;
  }
af36f906c   Tejun Heo   memcg: always cre...
3738
3739
3740
  static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
  			       struct file *file, char __user *buf,
  			       size_t nbytes, loff_t *ppos)
8cdea7c05   Balbir Singh   Memory controller...
3741
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
3742
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
af36f906c   Tejun Heo   memcg: always cre...
3743
  	char str[64];
104f39284   Kirill A. Shutemov   memcg: extract me...
3744
  	u64 val;
af36f906c   Tejun Heo   memcg: always cre...
3745
  	int type, name, len;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3746
3747
3748
  
  	type = MEMFILE_TYPE(cft->private);
  	name = MEMFILE_ATTR(cft->private);
af36f906c   Tejun Heo   memcg: always cre...
3749
3750
3751
  
  	if (!do_swap_account && type == _MEMSWAP)
  		return -EOPNOTSUPP;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3752
3753
  	switch (type) {
  	case _MEM:
104f39284   Kirill A. Shutemov   memcg: extract me...
3754
  		if (name == RES_USAGE)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3755
  			val = mem_cgroup_usage(memcg, false);
104f39284   Kirill A. Shutemov   memcg: extract me...
3756
  		else
c0ff4b854   Raghavendra K T   memcg: rename mem...
3757
  			val = res_counter_read_u64(&memcg->res, name);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3758
3759
  		break;
  	case _MEMSWAP:
104f39284   Kirill A. Shutemov   memcg: extract me...
3760
  		if (name == RES_USAGE)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3761
  			val = mem_cgroup_usage(memcg, true);
104f39284   Kirill A. Shutemov   memcg: extract me...
3762
  		else
c0ff4b854   Raghavendra K T   memcg: rename mem...
3763
  			val = res_counter_read_u64(&memcg->memsw, name);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3764
3765
3766
  		break;
  	default:
  		BUG();
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3767
  	}
af36f906c   Tejun Heo   memcg: always cre...
3768
3769
3770
3771
  
  	len = scnprintf(str, sizeof(str), "%llu
  ", (unsigned long long)val);
  	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
8cdea7c05   Balbir Singh   Memory controller...
3772
  }
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3773
3774
3775
3776
  /*
   * The user of this function is...
   * RES_LIMIT.
   */
856c13aa1   Paul Menage   cgroup files: con...
3777
3778
  static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
  			    const char *buffer)
8cdea7c05   Balbir Singh   Memory controller...
3779
  {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3780
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3781
  	int type, name;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3782
3783
  	unsigned long long val;
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3784
3785
  	type = MEMFILE_TYPE(cft->private);
  	name = MEMFILE_ATTR(cft->private);
af36f906c   Tejun Heo   memcg: always cre...
3786
3787
3788
  
  	if (!do_swap_account && type == _MEMSWAP)
  		return -EOPNOTSUPP;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3789
  	switch (name) {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3790
  	case RES_LIMIT:
4b3bde4c9   Balbir Singh   memcg: remove the...
3791
3792
3793
3794
  		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
  			ret = -EINVAL;
  			break;
  		}
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3795
3796
  		/* This function does all the necessary parsing...reuse it */
  		ret = res_counter_memparse_write_strategy(buffer, &val);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3797
3798
3799
  		if (ret)
  			break;
  		if (type == _MEM)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3800
  			ret = mem_cgroup_resize_limit(memcg, val);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3801
3802
  		else
  			ret = mem_cgroup_resize_memsw_limit(memcg, val);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3803
  		break;
296c81d89   Balbir Singh   memory controller...
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
  	case RES_SOFT_LIMIT:
  		ret = res_counter_memparse_write_strategy(buffer, &val);
  		if (ret)
  			break;
  		/*
  		 * For memsw, soft limits are hard to implement in terms
  		 * of semantics; for now, we support soft limits only for
  		 * control without swap.
  		 */
  		if (type == _MEM)
  			ret = res_counter_set_soft_limit(&memcg->res, val);
  		else
  			ret = -EINVAL;
  		break;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3818
3819
3820
3821
3822
  	default:
  		ret = -EINVAL; /* should be BUG() ? */
  		break;
  	}
  	return ret;
8cdea7c05   Balbir Singh   Memory controller...
3823
  }
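  
  /*
   * Illustrative sketch of the limit parsing used above: the strings come
   * from writes to files such as memory.limit_in_bytes.  K/M/G suffixes are
   * handled via memparse() and, as this sketch assumes, "-1" maps to
   * RESOURCE_MAX (i.e. unlimited).  The values in the comments are examples.
   */
  static void example_parse_limits(void)
  {
  	unsigned long long val;
  
  	res_counter_memparse_write_strategy("512M", &val);	/* 512 << 20 bytes */
  	res_counter_memparse_write_strategy("1G", &val);	/* 1 << 30 bytes */
  	res_counter_memparse_write_strategy("-1", &val);	/* RESOURCE_MAX */
  }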
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
  		unsigned long long *mem_limit, unsigned long long *memsw_limit)
  {
  	struct cgroup *cgroup;
  	unsigned long long min_limit, min_memsw_limit, tmp;
  
  	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  	cgroup = memcg->css.cgroup;
  	if (!memcg->use_hierarchy)
  		goto out;
  
  	while (cgroup->parent) {
  		cgroup = cgroup->parent;
  		memcg = mem_cgroup_from_cont(cgroup);
  		if (!memcg->use_hierarchy)
  			break;
  		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		min_limit = min(min_limit, tmp);
  		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		min_memsw_limit = min(min_memsw_limit, tmp);
  	}
  out:
  	*mem_limit = min_limit;
  	*memsw_limit = min_memsw_limit;
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
3849
  }
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3850
  static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
c84872e16   Pavel Emelyanov   memcgroup: add th...
3851
  {
af36f906c   Tejun Heo   memcg: always cre...
3852
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3853
  	int type, name;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3854

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3855
3856
  	type = MEMFILE_TYPE(event);
  	name = MEMFILE_ATTR(event);
af36f906c   Tejun Heo   memcg: always cre...
3857
3858
3859
  
  	if (!do_swap_account && type == _MEMSWAP)
  		return -EOPNOTSUPP;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3860
  	switch (name) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3861
  	case RES_MAX_USAGE:
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3862
  		if (type == _MEM)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3863
  			res_counter_reset_max(&memcg->res);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3864
  		else
c0ff4b854   Raghavendra K T   memcg: rename mem...
3865
  			res_counter_reset_max(&memcg->memsw);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3866
3867
  		break;
  	case RES_FAILCNT:
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3868
  		if (type == _MEM)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3869
  			res_counter_reset_failcnt(&memcg->res);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3870
  		else
c0ff4b854   Raghavendra K T   memcg: rename mem...
3871
  			res_counter_reset_failcnt(&memcg->memsw);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3872
3873
  		break;
  	}
f64c3f549   Balbir Singh   memory controller...
3874

85cc59db1   Pavel Emelyanov   memcgroup: use tr...
3875
  	return 0;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3876
  }
7dc74be03   Daisuke Nishimura   memcg: add interf...
3877
3878
3879
3880
3881
  static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
  					struct cftype *cft)
  {
  	return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
  }
024914477   Daisuke Nishimura   memcg: move charg...
3882
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
3883
3884
3885
  static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
  					struct cftype *cft, u64 val)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
3886
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
7dc74be03   Daisuke Nishimura   memcg: add interf...
3887
3888
3889
3890
3891
3892
3893
3894
3895
  
  	if (val >= (1 << NR_MOVE_TYPE))
  		return -EINVAL;
  	/*
  	 * We check this value several times, both in can_attach() and
  	 * attach(), so we need cgroup lock to prevent this value from being
  	 * inconsistent.
  	 */
  	cgroup_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
3896
  	memcg->move_charge_at_immigrate = val;
7dc74be03   Daisuke Nishimura   memcg: add interf...
3897
3898
3899
3900
  	cgroup_unlock();
  
  	return 0;
  }
024914477   Daisuke Nishimura   memcg: move charg...
3901
3902
3903
3904
3905
3906
3907
  #else
  static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
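
/*
 * memcg_numa_stat_show() below implements the memory.numa_stat file.  It
 * prints one line per statistic, each with a cgroup-wide total followed by
 * a per-node breakdown.  Illustrative output (values made up):
 *
 *	total=1234 N0=1000 N1=234
 *	file=800 N0=700 N1=100
 *	anon=400 N0=280 N1=120
 *	unevictable=34 N0=20 N1=14
 */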
#ifdef CONFIG_NUMA
static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
				      struct seq_file *m)
{
	int nid;
	unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
	unsigned long node_nr;
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
	seq_printf(m, "total=%lu", total_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
	seq_printf(m, "file=%lu", file_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
				LRU_ALL_FILE);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
	seq_printf(m, "anon=%lu", anon_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
				LRU_ALL_ANON);
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');

	unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
	seq_printf(m, "unevictable=%lu", unevictable_nr);
	for_each_node_state(nid, N_HIGH_MEMORY) {
		node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
				BIT(LRU_UNEVICTABLE));
		seq_printf(m, " N%d=%lu", nid, node_nr);
	}
	seq_putc(m, '\n');
	return 0;
}
#endif /* CONFIG_NUMA */
  static const char * const mem_cgroup_lru_names[] = {
  	"inactive_anon",
  	"active_anon",
  	"inactive_file",
  	"active_file",
  	"unevictable",
  };
  
  static inline void mem_cgroup_lru_names_not_uptodate(void)
  {
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
  }
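
/*
 * memcg_stat_show() below implements the memory.stat file.  It prints the
 * local counters first (one "<name> <value>" line per stat, event and LRU
 * list), then the effective hierarchical limits, and finally the same
 * counters summed over the whole subtree with a "total_" prefix.
 */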
static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
				 struct seq_file *m)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
	struct mem_cgroup *mi;
	unsigned int i;

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

	/* Hierarchical information */
	{
		unsigned long long limit, memsw_limit;
		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
		if (do_swap_account)
			seq_printf(m, "hierarchical_memsw_limit %llu\n",
				   memsw_limit);
	}

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		long long val = 0;

		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
			continue;
		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_read_events(mi, i);
		seq_printf(m, "total_%s %llu\n",
			   mem_cgroup_events_names[i], val);
	}

	for (i = 0; i < NR_LRU_LISTS; i++) {
		unsigned long long val = 0;

		for_each_mem_cgroup_tree(mi, memcg)
			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
	}

#ifdef CONFIG_DEBUG_VM
	{
		int nid, zid;
		struct mem_cgroup_per_zone *mz;
		struct zone_reclaim_stat *rstat;
		unsigned long recent_rotated[2] = {0, 0};
		unsigned long recent_scanned[2] = {0, 0};

		for_each_online_node(nid)
			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
				mz = mem_cgroup_zoneinfo(memcg, nid, zid);
				rstat = &mz->lruvec.reclaim_stat;

				recent_rotated[0] += rstat->recent_rotated[0];
				recent_rotated[1] += rstat->recent_rotated[1];
				recent_scanned[0] += rstat->recent_scanned[0];
				recent_scanned[1] += rstat->recent_scanned[1];
			}
		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
	}
#endif

	return 0;
}
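
/*
 * memory.swappiness: per-memcg swappiness used by reclaim instead of the
 * global vm.swappiness.  The write handler below only accepts values in
 * [0, 100], rejects the root cgroup and, under hierarchy, only allows an
 * "empty root" to be changed (see the check below).  Illustrative use:
 *
 *	echo 60 > <memcg mount>/<group>/memory.swappiness
 */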
static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	return mem_cgroup_swappiness(memcg);
}

static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
				       u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;

	if (val > 100)
		return -EINVAL;

	if (cgrp->parent == NULL)
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();

	/* If under hierarchy, only empty-root can set this value */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
		cgroup_unlock();
		return -EINVAL;
	}

	memcg->swappiness = val;

	cgroup_unlock();

	return 0;
}
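
/*
 * Memory threshold notifications.  Each thresholds array is kept sorted in
 * ascending order and current_threshold is the index of the last entry that
 * was <= the usage seen at the previous check.  A sketch of the invariant
 * __mem_cgroup_threshold() below restores (usage marked with ^):
 *
 *	entry index:     0     1     2     3
 *	threshold:       4M    8M    16M   32M
 *	                             ^ usage = 20M  ->  current_threshold = 2
 */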
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
	struct mem_cgroup_threshold_ary *t;
	u64 usage;
	int i;

	rcu_read_lock();
	if (!swap)
		t = rcu_dereference(memcg->thresholds.primary);
	else
		t = rcu_dereference(memcg->memsw_thresholds.primary);

	if (!t)
		goto unlock;

	usage = mem_cgroup_usage(memcg, swap);

	/*
	 * current_threshold points to the threshold just below or equal to
	 * usage.  If that is no longer true, a threshold was crossed after
	 * the last call of __mem_cgroup_threshold().
	 */
	i = t->current_threshold;

	/*
	 * Iterate backward over array of thresholds starting from
	 * current_threshold and check if a threshold is crossed.
	 * If none of thresholds below usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* i = current_threshold + 1 */
	i++;

	/*
	 * Iterate forward over array of thresholds starting from
	 * current_threshold+1 and check if a threshold is crossed.
	 * If none of thresholds above usage is crossed, we read
	 * only one element of the array here.
	 */
	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
		eventfd_signal(t->entries[i].eventfd, 1);

	/* Update current_threshold */
	t->current_threshold = i - 1;
unlock:
	rcu_read_unlock();
}

static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
	while (memcg) {
		__mem_cgroup_threshold(memcg, false);
		if (do_swap_account)
			__mem_cgroup_threshold(memcg, true);

		memcg = parent_mem_cgroup(memcg);
	}
}
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
  
  	return _a->threshold - _b->threshold;
  }
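
/*
 * OOM notification: eventfds registered through memory.oom_control are kept
 * on memcg->oom_notify.  mem_cgroup_oom_notify() below signals them for the
 * whole subtree by walking it with for_each_mem_cgroup_tree().
 */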
static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
{
	struct mem_cgroup_eventfd_list *ev;

	list_for_each_entry(ev, &memcg->oom_notify, list)
		eventfd_signal(ev->eventfd, 1);
	return 0;
}

static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, memcg)
		mem_cgroup_oom_notify_cb(iter);
}
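
/*
 * mem_cgroup_usage_register_event() below is the .register_event handler of
 * the usage files.  Userspace arms it through cgroup.event_control with an
 * eventfd, the fd of memory[.memsw].usage_in_bytes and a threshold; the
 * threshold string is parsed by res_counter_memparse_write_strategy(), so
 * suffixes such as "K", "M" or "G" are accepted.  Illustrative registration:
 *
 *	echo "<eventfd> <usage fd> 50M" > <group>/cgroup.event_control
 */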
  
  static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
  	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 threshold, usage;
	int i, size, ret;

	ret = res_counter_memparse_write_strategy(args, &threshold);
	if (ret)
		return ret;

	mutex_lock(&memcg->thresholds_lock);

	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before adding a new one */
	if (thresholds->primary)
		__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	size = thresholds->primary ? thresholds->primary->size + 1 : 1;

	/* Allocate memory for new array of thresholds */
	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
			GFP_KERNEL);
	if (!new) {
		ret = -ENOMEM;
		goto unlock;
	}
	new->size = size;

	/* Copy thresholds (if any) to new array */
	if (thresholds->primary) {
		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
				sizeof(struct mem_cgroup_threshold));
	}

	/* Add new threshold */
	new->entries[size - 1].eventfd = eventfd;
	new->entries[size - 1].threshold = threshold;

	/* Sort thresholds. Registering of new threshold isn't time-critical */
	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
			compare_thresholds, NULL);

	/* Find current threshold */
	new->current_threshold = -1;
	for (i = 0; i < size; i++) {
		if (new->entries[i].threshold <= usage) {
			/*
			 * new->current_threshold will not be used until
			 * rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		} else
			break;
	}

	/* Free old spare buffer and save old primary buffer as spare */
	kfree(thresholds->spare);
	thresholds->spare = thresholds->primary;

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();

unlock:
	mutex_unlock(&memcg->thresholds_lock);

	return ret;
}

static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_thresholds *thresholds;
	struct mem_cgroup_threshold_ary *new;
	int type = MEMFILE_TYPE(cft->private);
	u64 usage;
	int i, j, size;

	mutex_lock(&memcg->thresholds_lock);
	if (type == _MEM)
		thresholds = &memcg->thresholds;
	else if (type == _MEMSWAP)
		thresholds = &memcg->memsw_thresholds;
	else
		BUG();

	if (!thresholds->primary)
		goto unlock;

	usage = mem_cgroup_usage(memcg, type == _MEMSWAP);

	/* Check if a threshold crossed before removing */
	__mem_cgroup_threshold(memcg, type == _MEMSWAP);

	/* Calculate the new number of thresholds */
	size = 0;
	for (i = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd != eventfd)
			size++;
	}

	new = thresholds->spare;

	/* Set thresholds array to NULL if we don't have thresholds */
	if (!size) {
		kfree(new);
		new = NULL;
		goto swap_buffers;
	}

	new->size = size;

	/* Copy thresholds and find current threshold */
	new->current_threshold = -1;
	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
		if (thresholds->primary->entries[i].eventfd == eventfd)
			continue;

		new->entries[j] = thresholds->primary->entries[i];
		if (new->entries[j].threshold <= usage) {
			/*
			 * new->current_threshold will not be used
			 * until rcu_assign_pointer(), so it's safe to increment
			 * it here.
			 */
			++new->current_threshold;
		}
		j++;
	}

swap_buffers:
	/* Swap primary and spare array */
	thresholds->spare = thresholds->primary;
	/* If all events are unregistered, free the spare array */
	if (!new) {
		kfree(thresholds->spare);
		thresholds->spare = NULL;
	}

	rcu_assign_pointer(thresholds->primary, new);

	/* To be sure that nobody uses thresholds */
	synchronize_rcu();
unlock:
	mutex_unlock(&memcg->thresholds_lock);
}
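
/*
 * memory.oom_control also supports eventfd notification: the handlers below
 * add the eventfd to memcg->oom_notify under memcg_oom_lock, and a newly
 * registered listener is signalled immediately if the group is already
 * under OOM (memcg->under_oom).
 */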
static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *event;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);
	event = kmalloc(sizeof(*event), GFP_KERNEL);
	if (!event)
		return -ENOMEM;

	spin_lock(&memcg_oom_lock);

	event->eventfd = eventfd;
	list_add(&event->list, &memcg->oom_notify);

	/* already in OOM ? */
	if (atomic_read(&memcg->under_oom))
		eventfd_signal(eventfd, 1);
	spin_unlock(&memcg_oom_lock);

	return 0;
}

static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
	struct cftype *cft, struct eventfd_ctx *eventfd)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup_eventfd_list *ev, *tmp;
	int type = MEMFILE_TYPE(cft->private);

	BUG_ON(type != _OOM_TYPE);
	spin_lock(&memcg_oom_lock);

	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
		if (ev->eventfd == eventfd) {
			list_del(&ev->list);
			kfree(ev);
		}
	}
	spin_unlock(&memcg_oom_lock);
}

static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
	struct cftype *cft, struct cgroup_map_cb *cb)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);

	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);

	if (atomic_read(&memcg->under_oom))
		cb->fill(cb, "under_oom", 1);
	else
		cb->fill(cb, "under_oom", 0);
	return 0;
}

static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
	struct cftype *cft, u64 val)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
	struct mem_cgroup *parent;

	/* cannot set to root cgroup and only 0 and 1 are allowed */
	if (!cgrp->parent || !((val == 0) || (val == 1)))
		return -EINVAL;

	parent = mem_cgroup_from_cont(cgrp->parent);

	cgroup_lock();
	/* oom-kill-disable is a flag for subhierarchy. */
	if ((parent->use_hierarchy) ||
	    (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
		cgroup_unlock();
		return -EINVAL;
	}
	memcg->oom_kill_disable = val;
	if (!val)
		memcg_oom_recover(memcg);
	cgroup_unlock();
	return 0;
}

#ifdef CONFIG_MEMCG_KMEM
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return mem_cgroup_sockets_init(memcg, ss);
}

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
	mem_cgroup_sockets_destroy(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	return 0;
}

static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
{
}
#endif
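
/*
 * Control files exported by the memory controller.  Each res_counter-backed
 * file packs its counter type (_MEM, _MEMSWAP or _OOM_TYPE) and attribute
 * into .private via MEMFILE_PRIVATE(); the read/write/trigger handlers
 * unpack them again with MEMFILE_TYPE() and MEMFILE_ATTR().
 */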
static struct cftype mem_cgroup_files[] = {
	{
		.name = "usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
		.read = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	{
		.name = "limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "soft_limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "failcnt",
		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	{
		.name = "stat",
		.read_seq_string = memcg_stat_show,
	},
	{
		.name = "force_empty",
		.trigger = mem_cgroup_force_empty_write,
	},
	{
		.name = "use_hierarchy",
		.write_u64 = mem_cgroup_hierarchy_write,
		.read_u64 = mem_cgroup_hierarchy_read,
	},
	{
		.name = "swappiness",
		.read_u64 = mem_cgroup_swappiness_read,
		.write_u64 = mem_cgroup_swappiness_write,
	},
	{
		.name = "move_charge_at_immigrate",
		.read_u64 = mem_cgroup_move_charge_read,
		.write_u64 = mem_cgroup_move_charge_write,
	},
	{
		.name = "oom_control",
		.read_map = mem_cgroup_oom_control_read,
		.write_u64 = mem_cgroup_oom_control_write,
		.register_event = mem_cgroup_oom_register_event,
		.unregister_event = mem_cgroup_oom_unregister_event,
		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
	},
#ifdef CONFIG_NUMA
	{
		.name = "numa_stat",
		.read_seq_string = memcg_numa_stat_show,
	},
#endif
#ifdef CONFIG_MEMCG_SWAP
	{
		.name = "memsw.usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
		.read = mem_cgroup_read,
		.register_event = mem_cgroup_usage_register_event,
		.unregister_event = mem_cgroup_usage_unregister_event,
	},
	{
		.name = "memsw.max_usage_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
	{
		.name = "memsw.limit_in_bytes",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
		.write_string = mem_cgroup_write,
		.read = mem_cgroup_read,
	},
	{
		.name = "memsw.failcnt",
		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
		.trigger = mem_cgroup_reset,
		.read = mem_cgroup_read,
	},
#endif
	{ },	/* terminate */
};

static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
	struct mem_cgroup_per_node *pn;
	struct mem_cgroup_per_zone *mz;
	int zone, tmp = node;
	/*
	 * This routine is called against possible nodes.
	 * But it is a BUG to call kmalloc() against an offline node.
	 *
	 * TODO: this routine can waste much memory for nodes which will
	 *       never be onlined. It's better to use a memory hotplug
	 *       callback function.
	 */
	if (!node_state(node, N_NORMAL_MEMORY))
		tmp = -1;
	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
	if (!pn)
		return 1;

	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
		mz = &pn->zoneinfo[zone];
		lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
		mz->usage_in_excess = 0;
		mz->on_tree = false;
		mz->memcg = memcg;
	}
	memcg->info.nodeinfo[node] = pn;
	return 0;
}

static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
	kfree(memcg->info.nodeinfo[node]);
}
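
/*
 * struct mem_cgroup can be larger than a page once MAX_NUMNODES grows, so
 * mem_cgroup_alloc() below picks kzalloc() for small sizes and vzalloc()
 * otherwise; free_work() has to mirror that choice when freeing.
 */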
static struct mem_cgroup *mem_cgroup_alloc(void)
{
	struct mem_cgroup *memcg;
	int size = sizeof(struct mem_cgroup);

	/* Can be very big if MAX_NUMNODES is very big */
	if (size < PAGE_SIZE)
		memcg = kzalloc(size, GFP_KERNEL);
	else
		memcg = vzalloc(size);

	if (!memcg)
		return NULL;

	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
	if (!memcg->stat)
		goto out_free;
	spin_lock_init(&memcg->pcp_counter_lock);
	return memcg;

out_free:
	if (size < PAGE_SIZE)
		kfree(memcg);
	else
		vfree(memcg);
	return NULL;
}

/*
 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
 * but in process context.  The work_freeing structure is overlaid
 * on the rcu_freeing structure, which itself is overlaid on memsw.
 */
static void free_work(struct work_struct *work)
{
	struct mem_cgroup *memcg;
	int size = sizeof(struct mem_cgroup);

	memcg = container_of(work, struct mem_cgroup, work_freeing);
	/*
	 * We need to make sure that (at least for now), the jump label
	 * destruction code runs outside of the cgroup lock. This is because
	 * get_online_cpus(), which is called from the static_branch update,
	 * can't be called inside the cgroup_lock. cpusets are the ones
	 * enforcing this dependency, so if they ever change, we might as well.
	 *
	 * schedule_work() will guarantee this happens. Be careful if you need
	 * to move this code around, and make sure it is outside
	 * the cgroup_lock.
	 */
	disarm_sock_keys(memcg);
	if (size < PAGE_SIZE)
		kfree(memcg);
	else
		vfree(memcg);
}

static void free_rcu(struct rcu_head *rcu_head)
{
	struct mem_cgroup *memcg;

	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
	INIT_WORK(&memcg->work_freeing, free_work);
	schedule_work(&memcg->work_freeing);
}

/*
 * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
 * (Scanning everything at force_empty would be too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free the mem_cgroup when
 * it goes down to 0.
 *
 * Removal of the cgroup itself succeeds regardless of refs from swap.
 */
static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
	int node;

	mem_cgroup_remove_from_trees(memcg);
	free_css_id(&mem_cgroup_subsys, &memcg->css);

	for_each_node(node)
		free_mem_cgroup_per_zone_info(memcg, node);

	free_percpu(memcg->stat);
	call_rcu(&memcg->rcu_freeing, free_rcu);
}
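
/*
 * Reference counting: mem_cgroup_get()/mem_cgroup_put() pin the mem_cgroup
 * structure itself (e.g. for remaining swap records and, with use_hierarchy,
 * the child's reference on its parent taken in mem_cgroup_create()).  When
 * the count drops to zero, __mem_cgroup_free() releases the structure and
 * the reference on the parent is dropped in turn.
 */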
static void mem_cgroup_get(struct mem_cgroup *memcg)
{
	atomic_inc(&memcg->refcnt);
}

static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
{
	if (atomic_sub_and_test(count, &memcg->refcnt)) {
		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
		__mem_cgroup_free(memcg);
		if (parent)
			mem_cgroup_put(parent);
	}
}

static void mem_cgroup_put(struct mem_cgroup *memcg)
{
	__mem_cgroup_put(memcg, 1);
}

/*
 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
 */
struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
	if (!memcg->res.parent)
		return NULL;
	return mem_cgroup_from_res_counter(memcg->res.parent, res);
}
EXPORT_SYMBOL(parent_mem_cgroup);

#ifdef CONFIG_MEMCG_SWAP
static void __init enable_swap_cgroup(void)
{
	if (!mem_cgroup_disabled() && really_do_swap_account)
		do_swap_account = 1;
}
#else
static void __init enable_swap_cgroup(void)
{
}
#endif
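
/*
 * mem_cgroup_soft_limit_tree_init() below allocates one rb-tree root (plus
 * lock) per zone of every node; memcgs exceeding their soft limit are
 * indexed there (usage_in_excess/on_tree) so reclaim can find the worst
 * offenders.
 */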
static int mem_cgroup_soft_limit_tree_init(void)
{
	struct mem_cgroup_tree_per_node *rtpn;
	struct mem_cgroup_tree_per_zone *rtpz;
	int tmp, node, zone;

	for_each_node(node) {
		tmp = node;
		if (!node_state(node, N_NORMAL_MEMORY))
			tmp = -1;
		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
		if (!rtpn)
			goto err_cleanup;

		soft_limit_tree.rb_tree_per_node[node] = rtpn;

		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			rtpz = &rtpn->rb_tree_per_zone[zone];
			rtpz->rb_root = RB_ROOT;
			spin_lock_init(&rtpz->lock);
		}
	}
	return 0;

err_cleanup:
	for_each_node(node) {
		if (!soft_limit_tree.rb_tree_per_node[node])
			break;
		kfree(soft_limit_tree.rb_tree_per_node[node]);
		soft_limit_tree.rb_tree_per_node[node] = NULL;
	}
	return 1;
}
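
/*
 * mem_cgroup_create() is the cgroup core callback that brings up a new
 * memcg: it allocates the per-node/per-zone info, performs the one-time
 * root-only setup (swap accounting, soft limit tree, per-cpu charge stock,
 * CPU hotplug notifier) and, when the parent uses hierarchy, parents the
 * res_counters and takes a reference on the parent.
 */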
static struct cgroup_subsys_state * __ref
mem_cgroup_create(struct cgroup *cont)
{
	struct mem_cgroup *memcg, *parent;
	long error = -ENOMEM;
	int node;

	memcg = mem_cgroup_alloc();
	if (!memcg)
		return ERR_PTR(error);

	for_each_node(node)
		if (alloc_mem_cgroup_per_zone_info(memcg, node))
			goto free_out;

	/* root ? */
	if (cont->parent == NULL) {
		int cpu;
		enable_swap_cgroup();
		parent = NULL;
		if (mem_cgroup_soft_limit_tree_init())
			goto free_out;
		root_mem_cgroup = memcg;
		for_each_possible_cpu(cpu) {
			struct memcg_stock_pcp *stock =
						&per_cpu(memcg_stock, cpu);
			INIT_WORK(&stock->work, drain_local_stock);
		}
		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
	} else {
		parent = mem_cgroup_from_cont(cont->parent);
		memcg->use_hierarchy = parent->use_hierarchy;
		memcg->oom_kill_disable = parent->oom_kill_disable;
	}

	if (parent && parent->use_hierarchy) {
		res_counter_init(&memcg->res, &parent->res);
		res_counter_init(&memcg->memsw, &parent->memsw);
		/*
		 * We increment refcnt of the parent to ensure that we can
		 * safely access it on res_counter_charge/uncharge.
		 * This refcnt will be decremented when freeing this
		 * mem_cgroup (see mem_cgroup_put).
		 */
		mem_cgroup_get(parent);
	} else {
		res_counter_init(&memcg->res, NULL);
		res_counter_init(&memcg->memsw, NULL);
	}
	memcg->last_scanned_node = MAX_NUMNODES;
	INIT_LIST_HEAD(&memcg->oom_notify);

	if (parent)
		memcg->swappiness = mem_cgroup_swappiness(parent);
	atomic_set(&memcg->refcnt, 1);
	memcg->move_charge_at_immigrate = 0;
	mutex_init(&memcg->thresholds_lock);
	spin_lock_init(&memcg->move_lock);

	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
	if (error) {
		/*
		 * We call put now because our (and parent's) refcnts
		 * are already in place. mem_cgroup_put() will internally
		 * call __mem_cgroup_free, so return directly
		 */
		mem_cgroup_put(memcg);
		return ERR_PTR(error);
	}
	return &memcg->css;
free_out:
	__mem_cgroup_free(memcg);
	return ERR_PTR(error);
}

static int mem_cgroup_pre_destroy(struct cgroup *cont)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	return mem_cgroup_force_empty(memcg, false);
}

static void mem_cgroup_destroy(struct cgroup *cont)
{
	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);

	kmem_cgroup_destroy(memcg);

	mem_cgroup_put(memcg);
}
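
/*
 * Charge moving at task migration starts by precharging the destination
 * memcg.  mem_cgroup_do_precharge() below first tries to charge all of
 * "count" pages in one res_counter call and, if that fails, falls back to
 * charging page by page, bailing out on pending signals and calling
 * cond_resched() every PRECHARGE_COUNT_AT_ONCE iterations.
 */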
#ifdef CONFIG_MMU
/* Handlers for move charge at task migration. */
#define PRECHARGE_COUNT_AT_ONCE	256
static int mem_cgroup_do_precharge(unsigned long count)
{
	int ret = 0;
	int batch_count = PRECHARGE_COUNT_AT_ONCE;
	struct mem_cgroup *memcg = mc.to;

	if (mem_cgroup_is_root(memcg)) {
		mc.precharge += count;
		/* we don't need css_get for root */
		return ret;
	}
	/* try to charge at once */
	if (count > 1) {
		struct res_counter *dummy;
		/*
		 * "memcg" cannot be under rmdir() because we've already checked
		 * by cgroup_lock_live_cgroup() that it is not removed and we
		 * are still under the same cgroup_mutex. So we can postpone
		 * css_get().
		 */
		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
			goto one_by_one;
		if (do_swap_account && res_counter_charge(&memcg->memsw,
						PAGE_SIZE * count, &dummy)) {
			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
			goto one_by_one;
		}
		mc.precharge += count;
		return ret;
	}
one_by_one:
	/* fall back to one by one charge */
	while (count--) {
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		if (!batch_count--) {
			batch_count = PRECHARGE_COUNT_AT_ONCE;
			cond_resched();
		}
		ret = __mem_cgroup_try_charge(NULL,
					GFP_KERNEL, 1, &memcg, false);
		if (ret)
			/* mem_cgroup_clear_mc() will do uncharge later */
			return ret;
		mc.precharge++;
	}
	return ret;
}
  
  /**
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4859
   * get_mctgt_type - get target type of moving charge
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4860
4861
4862
   * @vma: the vma the pte to be checked belongs
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
4863
   * @target: the pointer the target page or swap ent will be stored(can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4864
4865
4866
4867
4868
4869
   *
   * Returns
   *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   *     move charge. If @target is not NULL, the page is stored in target->page
   *     with an extra refcount taken (callers should handle it).
024914477   Daisuke Nishimura   memcg: move charg...
4870
4871
4872
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. If @target is not NULL, the entry is
   *     stored in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4873
4874
4875
   *
   * Called with pte lock held.
   */
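  /*
   * A minimal usage sketch, mirroring mem_cgroup_move_charge_pte_range()
   * further below:
   *
   *	union mc_target target;
   *
   *	switch (get_mctgt_type(vma, addr, ptent, &target)) {
   *	case MC_TARGET_PAGE:
   *		... use target.page, then put_page(target.page) ...
   *		break;
   *	case MC_TARGET_SWAP:
   *		... use target.ent ...
   *		break;
   *	default:
   *		break;
   *	}
   */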
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4876
4877
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
4878
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4879
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4880
  enum mc_target_type {
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4881
  	MC_TARGET_NONE = 0,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4882
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
4883
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4884
  };
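  /*
   * mc_handle_present_pte() returns the page mapped by a present pte when it
   * is a move-charge candidate: anonymous pages only if move_anon() is set,
   * file pages only if move_file() is set.  A reference is taken with
   * get_page_unless_zero(); the caller must drop it.
   */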
90254a658   Daisuke Nishimura   memcg: clean up m...
4885
4886
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4887
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4888
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4889

90254a658   Daisuke Nishimura   memcg: clean up m...
4890
4891
4892
4893
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
  		/* we don't move shared anon */
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4894
  		if (!move_anon())
90254a658   Daisuke Nishimura   memcg: clean up m...
4895
  			return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4896
4897
  	} else if (!move_file())
  		/* we ignore mapcount for file pages */
90254a658   Daisuke Nishimura   memcg: clean up m...
4898
4899
4900
4901
4902
4903
  		return NULL;
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
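  /*
   * mc_handle_swap_pte() handles a swap pte: with CONFIG_SWAP it looks the
   * page up directly in swapper_space (avoiding the statistics updates done
   * by lookup_swap_cache()) and records the swap entry in *entry when
   * do_swap_account is set; without CONFIG_SWAP it simply returns NULL.
   */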
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4904
  #ifdef CONFIG_SWAP
90254a658   Daisuke Nishimura   memcg: clean up m...
4905
4906
4907
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4908
4909
4910
4911
4912
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
  
  	if (!move_anon() || non_swap_entry(ent))
  		return NULL;
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4913
4914
4915
4916
4917
  	/*
  	 * Because lookup_swap_cache() updates some statistics counters,
  	 * we call find_get_page() with swapper_space directly.
  	 */
  	page = find_get_page(&swapper_space, ent.val);
90254a658   Daisuke Nishimura   memcg: clean up m...
4918
4919
4920
4921
4922
  	if (do_swap_account)
  		entry->val = ent.val;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4923
4924
4925
4926
4927
4928
4929
  #else
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	return NULL;
  }
  #endif
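  /*
   * mc_handle_file_pte() looks up the page cache page backing a file pte
   * (or, for pte_none, the page at the linear index) when move_file() is
   * set.  shmem/tmpfs pages that were swapped out show up as exceptional
   * radix tree entries; those are translated to a swap entry and looked up
   * in swapper_space instead.
   */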
90254a658   Daisuke Nishimura   memcg: clean up m...
4930

87946a722   Daisuke Nishimura   memcg: move charg...
4931
4932
4933
4934
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4935
4936
4937
4938
4939
4940
4941
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
  	if (!move_file())
  		return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4942
4943
4944
4945
4946
4947
4948
  	mapping = vma->vm_file->f_mapping;
  	if (pte_none(ptent))
  		pgoff = linear_page_index(vma, addr);
  	else /* pte_file(ptent) is true */
  		pgoff = pte_to_pgoff(ptent);
  
  	/* The page is moved even if it's not in this task's RSS (not page-faulted in by it). */
aa3b18955   Hugh Dickins   tmpfs: convert me...
4949
4950
4951
4952
4953
4954
  	page = find_get_page(mapping, pgoff);
  
  #ifdef CONFIG_SWAP
  	/* shmem/tmpfs may report page out on swap: account for that too. */
  	if (radix_tree_exceptional_entry(page)) {
  		swp_entry_t swap = radix_to_swp_entry(page);
87946a722   Daisuke Nishimura   memcg: move charg...
4955
  		if (do_swap_account)
aa3b18955   Hugh Dickins   tmpfs: convert me...
4956
4957
  			*entry = swap;
  		page = find_get_page(&swapper_space, swap.val);
87946a722   Daisuke Nishimura   memcg: move charg...
4958
  	}
aa3b18955   Hugh Dickins   tmpfs: convert me...
4959
  #endif
87946a722   Daisuke Nishimura   memcg: move charg...
4960
4961
  	return page;
  }
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4962
  static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
90254a658   Daisuke Nishimura   memcg: clean up m...
4963
4964
4965
4966
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
  	struct page_cgroup *pc;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4967
  	enum mc_target_type ret = MC_TARGET_NONE;
90254a658   Daisuke Nishimura   memcg: clean up m...
4968
4969
4970
4971
4972
4973
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
87946a722   Daisuke Nishimura   memcg: move charg...
4974
4975
  	else if (pte_none(ptent) || pte_file(ptent))
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
4976
4977
  
  	if (!page && !ent.val)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4978
  		return ret;
024914477   Daisuke Nishimura   memcg: move charg...
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
  	if (page) {
  		pc = lookup_page_cgroup(page);
  		/*
  		 * Do only a loose check without taking the page_cgroup lock;
  		 * mem_cgroup_move_account() checks whether the pc is valid
  		 * under the lock.
  		 */
  		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4994
4995
  	/* There is a swap entry and the page either doesn't exist or isn't charged */
  	if (ent.val && !ret &&
9fb4b7cc0   Bob Liu   page_cgroup: add ...
4996
  			css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4997
4998
4999
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5000
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5001
5002
  	return ret;
  }
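  /*
   * The THP variant below only ever returns MC_TARGET_PAGE: it checks the
   * head page's page_cgroup against mc.from and, when @target is supplied,
   * takes a reference on the page.  Without CONFIG_TRANSPARENT_HUGEPAGE it
   * is a stub returning MC_TARGET_NONE.
   */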
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * We don't consider swapping or file-mapped pages because THP does not
   * support them for now.
   * The caller should make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	struct page *page = NULL;
  	struct page_cgroup *pc;
  	enum mc_target_type ret = MC_TARGET_NONE;
  
  	page = pmd_page(pmd);
  	VM_BUG_ON(!page || !PageHead(page));
  	if (!move_anon())
  		return ret;
  	pc = lookup_page_cgroup(page);
  	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  		ret = MC_TARGET_PAGE;
  		if (target) {
  			get_page(page);
  			target->page = page;
  		}
  	}
  	return ret;
  }
  #else
  static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	return MC_TARGET_NONE;
  }
  #endif
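  /*
   * Pagewalk callback used to count move-charge candidates: a transparent
   * huge pmd accounts for HPAGE_PMD_NR pages at once, otherwise every pte
   * that get_mctgt_type() recognizes bumps mc.precharge by one.
   */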
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5037
5038
5039
5040
5041
5042
5043
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5044
5045
5046
5047
  	if (pmd_trans_huge_lock(pmd, vma) == 1) {
  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
  			mc.precharge += HPAGE_PMD_NR;
  		spin_unlock(&vma->vm_mm->page_table_lock);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
5048
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5049
  	}
033193275   Dave Hansen   pagewalk: only sp...
5050

45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
5051
5052
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5053
5054
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5055
  		if (get_mctgt_type(vma, addr, *pte, NULL))
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5056
5057
5058
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
5059
5060
  	return 0;
  }
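  /*
   * Walk every non-hugetlb VMA of @mm under mmap_sem (read) to count how
   * many charges would have to be moved, then hand the accumulated count
   * back and reset mc.precharge.
   */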
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5061
5062
5063
5064
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
  	struct vm_area_struct *vma;
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5065
  	down_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5066
5067
5068
5069
5070
5071
5072
5073
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		struct mm_walk mem_cgroup_count_precharge_walk = {
  			.pmd_entry = mem_cgroup_count_precharge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5074
5075
5076
  		walk_page_range(vma->vm_start, vma->vm_end,
  					&mem_cgroup_count_precharge_walk);
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5077
  	up_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5078
5079
5080
5081
5082
5083
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
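  /*
   * Count the move-charge candidates and pre-charge that many pages to
   * mc.to.  mc.moving_task records the task performing the move.
   */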
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5084
5085
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5086
5087
5088
5089
5090
  	unsigned long precharge = mem_cgroup_count_precharge(mm);
  
  	VM_BUG_ON(mc.moving_task);
  	mc.moving_task = current;
  	return mem_cgroup_do_precharge(precharge);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5091
  }
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5092
5093
  /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
  static void __mem_cgroup_clear_mc(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5094
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5095
5096
  	struct mem_cgroup *from = mc.from;
  	struct mem_cgroup *to = mc.to;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5097
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
  	if (mc.precharge) {
  		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
  		mc.precharge = 0;
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
  		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
  		mc.moved_charge = 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5109
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
5110
5111
  	/* we must fix up refcounts and charges */
  	if (mc.moved_swap) {
483c30b51   Daisuke Nishimura   memcg: improve pe...
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
  		/* uncharge swap account from the old cgroup */
  		if (!mem_cgroup_is_root(mc.from))
  			res_counter_uncharge(&mc.from->memsw,
  						PAGE_SIZE * mc.moved_swap);
  		__mem_cgroup_put(mc.from, mc.moved_swap);
  
  		if (!mem_cgroup_is_root(mc.to)) {
  			/*
  			 * we charged both to->res and to->memsw, so we should
  			 * uncharge to->res.
  			 */
  			res_counter_uncharge(&mc.to->res,
  						PAGE_SIZE * mc.moved_swap);
483c30b51   Daisuke Nishimura   memcg: improve pe...
5125
5126
  		}
  		/* we've already done mem_cgroup_get(mc.to) */
483c30b51   Daisuke Nishimura   memcg: improve pe...
5127
5128
  		mc.moved_swap = 0;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
  	memcg_oom_recover(from);
  	memcg_oom_recover(to);
  	wake_up_all(&mc.waitq);
  }
  
  static void mem_cgroup_clear_mc(void)
  {
  	struct mem_cgroup *from = mc.from;
  
  	/*
  	 * we must clear moving_task before waking up waiters at the end of
  	 * task migration.
  	 */
  	mc.moving_task = NULL;
  	__mem_cgroup_clear_mc();
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5144
  	spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5145
5146
  	mc.from = NULL;
  	mc.to = NULL;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5147
  	spin_unlock(&mc.lock);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
5148
  	mem_cgroup_end_move(from);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5149
  }
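  /*
   * cgroup ->can_attach handler: when move_charge_at_immigrate is enabled
   * and the first task of the set owns its mm, record mc.from/mc.to under
   * mc.lock, start the move with mem_cgroup_start_move() and pre-charge.
   * Any failure tears the state down again via mem_cgroup_clear_mc().
   */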
761b3ef50   Li Zefan   cgroup: remove cg...
5150
5151
  static int mem_cgroup_can_attach(struct cgroup *cgroup,
  				 struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5152
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
5153
  	struct task_struct *p = cgroup_taskset_first(tset);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5154
  	int ret = 0;
c0ff4b854   Raghavendra K T   memcg: rename mem...
5155
  	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5156

c0ff4b854   Raghavendra K T   memcg: rename mem...
5157
  	if (memcg->move_charge_at_immigrate) {
7dc74be03   Daisuke Nishimura   memcg: add interf...
5158
5159
  		struct mm_struct *mm;
  		struct mem_cgroup *from = mem_cgroup_from_task(p);
c0ff4b854   Raghavendra K T   memcg: rename mem...
5160
  		VM_BUG_ON(from == memcg);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5161
5162
5163
5164
  
  		mm = get_task_mm(p);
  		if (!mm)
  			return 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
5165
  	/* We move charges only when we move an owner of the mm */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5166
5167
5168
5169
  		if (mm->owner == p) {
  			VM_BUG_ON(mc.from);
  			VM_BUG_ON(mc.to);
  			VM_BUG_ON(mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5170
  			VM_BUG_ON(mc.moved_charge);
483c30b51   Daisuke Nishimura   memcg: improve pe...
5171
  			VM_BUG_ON(mc.moved_swap);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
5172
  			mem_cgroup_start_move(from);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5173
  			spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5174
  			mc.from = from;
c0ff4b854   Raghavendra K T   memcg: rename mem...
5175
  			mc.to = memcg;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5176
  			spin_unlock(&mc.lock);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5177
  			/* We set mc.moving_task later */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5178
5179
5180
5181
  
  			ret = mem_cgroup_precharge_mc(mm);
  			if (ret)
  				mem_cgroup_clear_mc();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5182
5183
  		}
  		mmput(mm);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5184
5185
5186
  	}
  	return ret;
  }
761b3ef50   Li Zefan   cgroup: remove cg...
5187
5188
  static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
  				     struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5189
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5190
  	mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
5191
  }
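  /*
   * Pagewalk callback that performs the actual move: a transparent huge pmd
   * is moved as a single HPAGE_PMD_NR-sized chunk (provided enough precharge
   * is left), otherwise each pte is classified with get_mctgt_type() and
   * either the page (mem_cgroup_move_account) or the swap entry
   * (mem_cgroup_move_swap_account) is moved.  When precharge runs out we
   * drop the pte lock, charge one more page and retry the range.
   */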
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5192
5193
5194
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5195
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5196
5197
5198
5199
  	int ret = 0;
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5200
5201
5202
5203
  	enum mc_target_type target_type;
  	union mc_target target;
  	struct page *page;
  	struct page_cgroup *pc;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5204

12724850e   Naoya Horiguchi   memcg: avoid THP ...
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
  	/*
  	 * We don't take compound_lock() here, but no race with thp splitting
  	 * can happen because:
  	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
  	 *    under splitting, which means there's no concurrent thp split,
  	 *  - if another thread runs into split_huge_page() just after we
  	 *    entered this if-block, the thread must wait for page table lock
  	 *    to be unlocked in __split_huge_page_splitting(), where the main
  	 *    part of thp split is not executed yet.
  	 */
  	if (pmd_trans_huge_lock(pmd, vma) == 1) {
62ade86ab   Hugh Dickins   memcg,thp: fix re...
5216
  		if (mc.precharge < HPAGE_PMD_NR) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5217
5218
5219
5220
5221
5222
5223
5224
5225
  			spin_unlock(&vma->vm_mm->page_table_lock);
  			return 0;
  		}
  		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
  		if (target_type == MC_TARGET_PAGE) {
  			page = target.page;
  			if (!isolate_lru_page(page)) {
  				pc = lookup_page_cgroup(page);
  				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
5226
  							pc, mc.from, mc.to)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5227
5228
5229
5230
5231
5232
5233
5234
  					mc.precharge -= HPAGE_PMD_NR;
  					mc.moved_charge += HPAGE_PMD_NR;
  				}
  				putback_lru_page(page);
  			}
  			put_page(page);
  		}
  		spin_unlock(&vma->vm_mm->page_table_lock);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
5235
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5236
  	}
45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
5237
5238
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5239
5240
5241
5242
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
024914477   Daisuke Nishimura   memcg: move charg...
5243
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5244
5245
5246
  
  		if (!mc.precharge)
  			break;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5247
  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5248
5249
5250
5251
5252
  		case MC_TARGET_PAGE:
  			page = target.page;
  			if (isolate_lru_page(page))
  				goto put;
  			pc = lookup_page_cgroup(page);
7ec99d621   Johannes Weiner   memcg: unify char...
5253
  			if (!mem_cgroup_move_account(page, 1, pc,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
5254
  						     mc.from, mc.to)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5255
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5256
5257
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5258
5259
  			}
  			putback_lru_page(page);
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5260
  put:			/* get_mctgt_type() took a reference on the page */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5261
5262
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
5263
5264
  		case MC_TARGET_SWAP:
  			ent = target.ent;
e91cbb425   Hugh Dickins   memcg swap: mem_c...
5265
  			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
024914477   Daisuke Nishimura   memcg: move charg...
5266
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
5267
5268
5269
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
5270
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all precharges we got in can_attach().
  		 * We try to charge one by one, but don't do any additional
  		 * charges to mc.to if we have already failed to charge once
  		 * in the attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5285
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
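  /*
   * Drain the per-cpu LRU pagevecs and walk every non-hugetlb VMA of @mm
   * with the move-charge pagewalk.  mmap_sem is taken with a trylock: if it
   * is contended we cancel the extra charges and wake up waiters first to
   * avoid deadlocking against them, then retry.
   */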
  
  static void mem_cgroup_move_charge(struct mm_struct *mm)
  {
  	struct vm_area_struct *vma;
  
  	lru_add_drain_all();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
  retry:
  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  		/*
  		 * Someone who is holding the mmap_sem might be waiting on
  		 * the waitq. So we cancel all extra charges, wake up all waiters,
  		 * and retry. Because we cancel precharges, we might not be able
  		 * to move enough charges, but moving charge is a best-effort
  		 * feature anyway, so it wouldn't be a big problem.
  		 */
  		__mem_cgroup_clear_mc();
  		cond_resched();
  		goto retry;
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5311
5312
5313
5314
5315
5316
5317
5318
5319
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		int ret;
  		struct mm_walk mem_cgroup_move_charge_walk = {
  			.pmd_entry = mem_cgroup_move_charge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5320
5321
5322
5323
5324
5325
5326
5327
5328
  		ret = walk_page_range(vma->vm_start, vma->vm_end,
  						&mem_cgroup_move_charge_walk);
  		if (ret)
  			/*
  			 * This means we have consumed all precharges and failed
  			 * to do an additional charge. Just abandon here.
  			 */
  			break;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5329
  	up_read(&mm->mmap_sem);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5330
  }
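  /*
   * cgroup ->attach handler: if a move was prepared in can_attach() (mc.to
   * is set), move the charges of the task's mm and then clear the move
   * state.
   */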
761b3ef50   Li Zefan   cgroup: remove cg...
5331
5332
  static void mem_cgroup_move_task(struct cgroup *cont,
  				 struct cgroup_taskset *tset)
67e465a77   Balbir Singh   Memory controller...
5333
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
5334
  	struct task_struct *p = cgroup_taskset_first(tset);
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5335
  	struct mm_struct *mm = get_task_mm(p);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5336

dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5337
  	if (mm) {
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5338
5339
  		if (mc.to)
  			mem_cgroup_move_charge(mm);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5340
5341
  		mmput(mm);
  	}
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5342
5343
  	if (mc.to)
  		mem_cgroup_clear_mc();
67e465a77   Balbir Singh   Memory controller...
5344
  }
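  /*
   * Without an MMU there is nothing to move, so the attach callbacks below
   * are empty stubs.
   */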
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5345
  #else	/* !CONFIG_MMU */
761b3ef50   Li Zefan   cgroup: remove cg...
5346
5347
  static int mem_cgroup_can_attach(struct cgroup *cgroup,
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5348
5349
5350
  {
  	return 0;
  }
761b3ef50   Li Zefan   cgroup: remove cg...
5351
5352
  static void mem_cgroup_cancel_attach(struct cgroup *cgroup,
  				     struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5353
5354
  {
  }
761b3ef50   Li Zefan   cgroup: remove cg...
5355
5356
  static void mem_cgroup_move_task(struct cgroup *cont,
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5357
5358
5359
  {
  }
  #endif
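  /*
   * Registration of the memory controller with the cgroup core: create,
   * pre_destroy and destroy callbacks, the move-charge attach handlers
   * defined above, the base control files and CSS id usage.
   */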
67e465a77   Balbir Singh   Memory controller...
5360

8cdea7c05   Balbir Singh   Memory controller...
5361
5362
5363
5364
  struct cgroup_subsys mem_cgroup_subsys = {
  	.name = "memory",
  	.subsys_id = mem_cgroup_subsys_id,
  	.create = mem_cgroup_create,
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
5365
  	.pre_destroy = mem_cgroup_pre_destroy,
8cdea7c05   Balbir Singh   Memory controller...
5366
  	.destroy = mem_cgroup_destroy,
7dc74be03   Daisuke Nishimura   memcg: add interf...
5367
5368
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
67e465a77   Balbir Singh   Memory controller...
5369
  	.attach = mem_cgroup_move_task,
6bc103498   Tejun Heo   cgroup: convert m...
5370
  	.base_cftypes = mem_cgroup_files,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5371
  	.early_init = 0,
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
5372
  	.use_id = 1,
48ddbe194   Tejun Heo   cgroup: make css-...
5373
  	.__DEPRECATED_clear_css_refs = true,
8cdea7c05   Balbir Singh   Memory controller...
5374
  };
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5375

c255a4580   Andrew Morton   memcg: rename con...
5376
  #ifdef CONFIG_MEMCG_SWAP
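  /*
   * Boot-time control of swap accounting, e.g. "swapaccount=0" on the
   * kernel command line disables it and "swapaccount=1" enables it.
   */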
a42c390cf   Michal Hocko   cgroups: make swa...
5377
5378
5379
  static int __init enable_swap_account(char *s)
  {
  	/* consider it enabled if no parameter or 1 is given */
a2c8990ae   Michal Hocko   memsw: remove nos...
5380
  	if (!strcmp(s, "1"))
a42c390cf   Michal Hocko   cgroups: make swa...
5381
  		really_do_swap_account = 1;
a2c8990ae   Michal Hocko   memsw: remove nos...
5382
  	else if (!strcmp(s, "0"))
a42c390cf   Michal Hocko   cgroups: make swa...
5383
5384
5385
  		really_do_swap_account = 0;
  	return 1;
  }
a2c8990ae   Michal Hocko   memsw: remove nos...
5386
  __setup("swapaccount=", enable_swap_account);
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5387

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5388
  #endif