mm/memcontrol.c
/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
  
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>
#include <trace/events/vmscan.h>

struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata;
#endif
#else
#define do_swap_account		0
#endif

static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};

#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	int last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;
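
/*
 * Illustrative note (not part of the original source): for a given page,
 * the matching soft-limit tree is
 *
 *	soft_limit_tree.rb_tree_per_node[page_to_nid(page)]
 *		->rb_tree_per_zone[page_zonenum(page)]
 *
 * and memcgs whose usage exceeds their soft limit are keyed in that RB-tree
 * by usage_in_excess; see soft_limit_tree_from_page() and
 * __mem_cgroup_insert_exceeded() further down.
 */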

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add a new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};
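
/*
 * Illustrative usage note (not in the original source): with the cgroup v1
 * interface these events are registered from userspace roughly as
 *
 *	echo "<event_fd> <control_fd> [args]" > cgroup.event_control
 *
 * e.g. pairing an eventfd with memory.usage_in_bytes plus a threshold, or
 * with memory.oom_control; the register_event()/unregister_event()
 * callbacks above wire the eventfd up and tear it down.
 */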
  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

f64c3f549   Balbir Singh   memory controller...
258
  /*
8cdea7c05   Balbir Singh   Memory controller...
259
260
261
262
263
264
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
   *
   * TODO: Add a water mark for the memory controller. Reclaim will begin when
8a9f3ccd2   Balbir Singh   Memory controller...
265
266
267
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
8cdea7c05   Balbir Singh   Memory controller...
268
269
270
271
272
273
274
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
  	/*
  	 * the counter to account for memory usage
  	 */
  	struct res_counter res;
59927fb98   Hugh Dickins   memcg: free mem_c...
275

70ddf637e   Anton Vorontsov   memcg: add memory...
276
277
  	/* vmpressure notifications */
  	struct vmpressure vmpressure;
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
278
279
  	/* css_online() has been completed */
  	int initialized;
465939a1f   Li Zefan   memcg: don't need...
280
281
282
283
  	/*
  	 * the counter to account for mem+swap usage.
  	 */
  	struct res_counter memsw;
59927fb98   Hugh Dickins   memcg: free mem_c...
284

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
285
  	/*
510fc4e11   Glauber Costa   memcg: kmem accou...
286
287
288
289
  	 * the counter to account for kernel memory usage.
  	 */
  	struct res_counter kmem;
  	/*
18f59ea7d   Balbir Singh   memcg: memory cgr...
290
291
292
  	 * Should the accounting and control be hierarchical, per subtree?
  	 */
  	bool use_hierarchy;
510fc4e11   Glauber Costa   memcg: kmem accou...
293
  	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
79dfdaccd   Michal Hocko   memcg: make oom_l...
294
295
296
  
  	bool		oom_lock;
  	atomic_t	under_oom;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
297
  	atomic_t	oom_wakeups;
79dfdaccd   Michal Hocko   memcg: make oom_l...
298

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
299
  	int	swappiness;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
300
301
  	/* OOM-Killer disable */
  	int		oom_kill_disable;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
302

2e72b6347   Kirill A. Shutemov   memcg: implement ...
303
304
305
306
  	/* protect arrays of thresholds */
  	struct mutex thresholds_lock;
  
  	/* thresholds for memory usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
307
  	struct mem_cgroup_thresholds thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
308

2e72b6347   Kirill A. Shutemov   memcg: implement ...
309
  	/* thresholds for mem+swap usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
310
  	struct mem_cgroup_thresholds memsw_thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
311

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
312
313
  	/* For oom notifier event fd */
  	struct list_head oom_notify;
185efc0f9   Johannes Weiner   memcg: Revert "me...
314

d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
315
  	/*
7dc74be03   Daisuke Nishimura   memcg: add interf...
316
317
318
  	 * Should we move charges of a task when a task is moved into this
  	 * mem_cgroup ? And what type of charges should we move ?
  	 */
f894ffa86   Andrew Morton   memcg: trivial cl...
319
  	unsigned long move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
320
  	/*
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
321
322
323
  	 * set > 0 if pages under this cgroup are moving to other cgroup.
  	 */
  	atomic_t	moving_account;
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
324
325
  	/* taken only while moving_account > 0 */
  	spinlock_t	move_lock;
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
326
  	/*
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
327
  	 * percpu counter.
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
328
  	 */
3a7951b4c   Kirill A. Shutemov   memcg: mark stat ...
329
  	struct mem_cgroup_stat_cpu __percpu *stat;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
330
331
332
333
334
335
  	/*
  	 * used when a cpu is offlined or other synchronizations
  	 * See mem_cgroup_read_stat().
  	 */
  	struct mem_cgroup_stat_cpu nocpu_base;
  	spinlock_t pcp_counter_lock;
d1a4c0b37   Glauber Costa   tcp memory pressu...
336

5f5781619   Michal Hocko   memcg: relax memc...
337
  	atomic_t	dead_count;
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
338
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
339
  	struct cg_proto tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
340
  #endif
2633d7a02   Glauber Costa   slab/slub: consid...
341
  #if defined(CONFIG_MEMCG_KMEM)
bd6731458   Vladimir Davydov   memcg, slab: simp...
342
343
  	/* analogous to slab_common's slab_caches list, but per-memcg;
  	 * protected by memcg_slab_mutex */
2633d7a02   Glauber Costa   slab/slub: consid...
344
  	struct list_head memcg_slab_caches;
2633d7a02   Glauber Costa   slab/slub: consid...
345
346
347
          /* Index in the kmem_cache->memcg_params->memcg_caches array */
  	int kmemcg_id;
  #endif
45cf7ebd5   Glauber Costa   memcg: reduce the...
348
349
350
351
352
353
354
  
  	int last_scanned_node;
  #if MAX_NUMNODES > 1
  	nodemask_t	scan_nodes;
  	atomic_t	numainfo_events;
  	atomic_t	numainfo_updating;
  #endif
70ddf637e   Anton Vorontsov   memcg: add memory...
355

fba948078   Tejun Heo   cgroup, memcg: mo...
356
357
358
  	/* List of events which userspace want to receive */
  	struct list_head event_list;
  	spinlock_t event_list_lock;
54f72fe02   Johannes Weiner   memcg: clean up m...
359
360
  	struct mem_cgroup_per_node *nodeinfo[0];
  	/* WARNING: nodeinfo must be the last member here */
8cdea7c05   Balbir Singh   Memory controller...
361
  };
510fc4e11   Glauber Costa   memcg: kmem accou...
362
363
  /* internal only representation about the status of kmem accounting. */
  enum {
6de64beb3   Vladimir Davydov   memcg: remove KME...
364
  	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
7de37682b   Glauber Costa   memcg: kmem accou...
365
  	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
510fc4e11   Glauber Costa   memcg: kmem accou...
366
  };
510fc4e11   Glauber Costa   memcg: kmem accou...
367
368
369
370
371
  #ifdef CONFIG_MEMCG_KMEM
  static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
  {
  	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
  }
7de37682b   Glauber Costa   memcg: kmem accou...
372
373
374
375
376
377
378
379
  
  static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
  {
  	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
  }
  
  static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
  {
10d5ebf40   Li Zefan   memcg: use css_ge...
380
381
382
383
384
  	/*
  	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
  	 * will call css_put() if it sees the memcg is dead.
  	 */
  	smp_wmb();
7de37682b   Glauber Costa   memcg: kmem accou...
385
386
387
388
389
390
391
392
393
  	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
  		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
  }
  
  static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
  {
  	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
  				  &memcg->kmem_account_flags);
  }
510fc4e11   Glauber Costa   memcg: kmem accou...
394
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
395
396
/* Stuff for moving charges at task migration. */
  /*
ee5e8472b   Glauber Costa   memcg: prevent ch...
397
398
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
7dc74be03   Daisuke Nishimura   memcg: add interf...
399
400
   */
  enum move_type {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
401
  	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
87946a722   Daisuke Nishimura   memcg: move charg...
402
  	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
7dc74be03   Daisuke Nishimura   memcg: add interf...
403
404
  	NR_MOVE_TYPE,
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
405
406
  /* "mc" and its members are protected by cgroup_mutex */
  static struct move_charge_struct {
b1dd693e5   Daisuke Nishimura   memcg: avoid dead...
407
  	spinlock_t	  lock; /* for from, to */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
408
409
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
ee5e8472b   Glauber Costa   memcg: prevent ch...
410
  	unsigned long immigrate_flags;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
411
  	unsigned long precharge;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
412
  	unsigned long moved_charge;
483c30b51   Daisuke Nishimura   memcg: improve pe...
413
  	unsigned long moved_swap;
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
414
415
416
  	struct task_struct *moving_task;	/* a task moving charges */
  	wait_queue_head_t waitq;		/* a waitq for other context */
  } mc = {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
417
  	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
418
419
  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
420

90254a658   Daisuke Nishimura   memcg: clean up m...
421
422
  static bool move_anon(void)
  {
ee5e8472b   Glauber Costa   memcg: prevent ch...
423
  	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
90254a658   Daisuke Nishimura   memcg: clean up m...
424
  }
87946a722   Daisuke Nishimura   memcg: move charg...
425
426
  static bool move_file(void)
  {
ee5e8472b   Glauber Costa   memcg: prevent ch...
427
  	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
87946a722   Daisuke Nishimura   memcg: move charg...
428
  }
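
/*
 * Illustrative note (not in the original source):
 * memory.move_charge_at_immigrate is a bitmap -- writing 1 sets
 * MOVE_CHARGE_TYPE_ANON, 2 sets MOVE_CHARGE_TYPE_FILE, and 3 sets both;
 * mc.immigrate_flags is a snapshot of that bitmap taken when a
 * charge-moving task migration begins.
 */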
4e4169535   Balbir Singh   memory controller...
429
430
431
432
  /*
   * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
   * limit reclaim to prevent infinite loops, if they ever occur.
   */
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
433
  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
434
  #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
4e4169535   Balbir Singh   memory controller...
435

217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
436
437
  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
438
  	MEM_CGROUP_CHARGE_TYPE_ANON,
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
439
  	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
440
  	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
c05555b57   KAMEZAWA Hiroyuki   memcg: atomic ops...
441
442
  	NR_CHARGE_TYPE,
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
443
  /* for encoding cft->private value on file */
86ae53e1a   Glauber Costa   memcg: change def...
444
445
446
447
  enum res_type {
  	_MEM,
  	_MEMSWAP,
  	_OOM_TYPE,
510fc4e11   Glauber Costa   memcg: kmem accou...
448
  	_KMEM,
86ae53e1a   Glauber Costa   memcg: change def...
449
  };
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
450
451
  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
452
  #define MEMFILE_ATTR(val)	((val) & 0xffff)
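
/*
 * Illustrative example (not in the original source): the mem+swap limit
 * file is wired up roughly as
 *
 *	cft->private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);
 *
 * and its handlers recover the pieces again with MEMFILE_TYPE(cft->private)
 * and MEMFILE_ATTR(cft->private).
 */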
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
453
454
/* Used for OOM notifier */
  #define OOM_CONTROL		(0)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
455

75822b449   Balbir Singh   memory controller...
456
  /*
0999821b1   Glauber Costa   memcg: replace cg...
457
458
459
460
461
   * The memcg_create_mutex will be held whenever a new cgroup is created.
   * As a consequence, any change that needs to protect against new child cgroups
   * appearing has to hold it as well.
   */
  static DEFINE_MUTEX(memcg_create_mutex);
b21451459   Wanpeng Li   memcg: add mem_cg...
462
463
  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
  {
a7c6d554a   Tejun Heo   cgroup: add/updat...
464
  	return s ? container_of(s, struct mem_cgroup, css) : NULL;
b21451459   Wanpeng Li   memcg: add mem_cg...
465
  }
70ddf637e   Anton Vorontsov   memcg: add memory...
466
467
468
469
470
471
472
473
474
475
476
477
  /* Some nice accessors for the vmpressure. */
  struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
  {
  	if (!memcg)
  		memcg = root_mem_cgroup;
  	return &memcg->vmpressure;
  }
  
  struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
  {
  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
  }
7ffc0edc4   Michal Hocko   memcg: move mem_c...
478
479
480
481
  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
  	return (memcg == root_mem_cgroup);
  }
4219b2da2   Li Zefan   memcg: fail to cr...
482
483
484
485
486
  /*
   * We restrict the id in the range of [1, 65535], so it can fit into
   * an unsigned short.
   */
  #define MEM_CGROUP_ID_MAX	USHRT_MAX
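
/*
 * Illustrative note (not in the original source): keeping the id within
 * unsigned short range matters because, for example, the per-swap-entry
 * records in swap_cgroup store the owning memcg as a 16-bit id.
 */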
34c00c319   Li Zefan   memcg: convert to...
487
488
  static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
  {
15a4c835e   Tejun Heo   cgroup, memcg: im...
489
  	return memcg->css.id;
34c00c319   Li Zefan   memcg: convert to...
490
491
492
493
494
  }
  
  static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
  {
  	struct cgroup_subsys_state *css;
7d699ddb2   Tejun Heo   cgroup, memcg: al...
495
  	css = css_from_id(id, &memory_cgrp_subsys);
34c00c319   Li Zefan   memcg: convert to...
496
497
  	return mem_cgroup_from_css(css);
  }
e1aab161e   Glauber Costa   socket: initial c...
498
  /* Writing them here to avoid exposing memcg's inner layout */
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
499
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
e1aab161e   Glauber Costa   socket: initial c...
500

e1aab161e   Glauber Costa   socket: initial c...
501
502
  void sock_update_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
503
  	if (mem_cgroup_sockets_enabled) {
e1aab161e   Glauber Costa   socket: initial c...
504
  		struct mem_cgroup *memcg;
3f1346193   Glauber Costa   memcg: decrement ...
505
  		struct cg_proto *cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
506
507
  
  		BUG_ON(!sk->sk_prot->proto_cgroup);
f3f511e1c   Glauber Costa   net: fix sock_clo...
508
509
510
511
512
513
514
515
516
517
  		/* Socket cloning can throw us here with sk_cgrp already
		 * filled. It won't, however, necessarily happen from
  		 * process context. So the test for root memcg given
  		 * the current task's memcg won't help us in this case.
  		 *
  		 * Respecting the original socket's memcg is a better
  		 * decision in this case.
  		 */
  		if (sk->sk_cgrp) {
  			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
5347e5ae1   Li Zefan   memcg: use css_ge...
518
  			css_get(&sk->sk_cgrp->memcg->css);
f3f511e1c   Glauber Costa   net: fix sock_clo...
519
520
  			return;
  		}
e1aab161e   Glauber Costa   socket: initial c...
521
522
  		rcu_read_lock();
  		memcg = mem_cgroup_from_task(current);
3f1346193   Glauber Costa   memcg: decrement ...
523
  		cg_proto = sk->sk_prot->proto_cgroup(memcg);
5347e5ae1   Li Zefan   memcg: use css_ge...
524
  		if (!mem_cgroup_is_root(memcg) &&
ec903c0c8   Tejun Heo   cgroup: rename cs...
525
526
  		    memcg_proto_active(cg_proto) &&
  		    css_tryget_online(&memcg->css)) {
3f1346193   Glauber Costa   memcg: decrement ...
527
  			sk->sk_cgrp = cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
528
529
530
531
532
533
534
535
  		}
  		rcu_read_unlock();
  	}
  }
  EXPORT_SYMBOL(sock_update_memcg);
  
  void sock_release_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
536
  	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
e1aab161e   Glauber Costa   socket: initial c...
537
538
539
  		struct mem_cgroup *memcg;
  		WARN_ON(!sk->sk_cgrp->memcg);
  		memcg = sk->sk_cgrp->memcg;
5347e5ae1   Li Zefan   memcg: use css_ge...
540
  		css_put(&sk->sk_cgrp->memcg->css);
e1aab161e   Glauber Costa   socket: initial c...
541
542
  	}
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
543
544
545
546
547
  
  struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
  {
  	if (!memcg || mem_cgroup_is_root(memcg))
  		return NULL;
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
548
  	return &memcg->tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
549
550
  }
  EXPORT_SYMBOL(tcp_proto_cgroup);
e1aab161e   Glauber Costa   socket: initial c...
551

3f1346193   Glauber Costa   memcg: decrement ...
552
553
  static void disarm_sock_keys(struct mem_cgroup *memcg)
  {
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
554
  	if (!memcg_proto_activated(&memcg->tcp_mem))
3f1346193   Glauber Costa   memcg: decrement ...
555
556
557
558
559
560
561
562
  		return;
  	static_key_slow_dec(&memcg_socket_limit_enabled);
  }
  #else
  static void disarm_sock_keys(struct mem_cgroup *memcg)
  {
  }
  #endif
a8964b9b8   Glauber Costa   memcg: use static...
563
  #ifdef CONFIG_MEMCG_KMEM
55007d849   Glauber Costa   memcg: allocate m...
564
565
  /*
   * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
b86278359   Li Zefan   memcg: stop using...
566
567
568
569
570
   * The main reason for not using cgroup id for this:
   *  this works better in sparse environments, where we have a lot of memcgs,
   *  but only a few kmem-limited. Or also, if we have, for instance, 200
   *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
   *  200 entry array for that.
55007d849   Glauber Costa   memcg: allocate m...
571
572
573
574
575
576
   *
   * The current size of the caches array is stored in
   * memcg_limited_groups_array_size.  It will double each time we have to
   * increase it.
   */
  static DEFINE_IDA(kmem_limited_groups);
749c54151   Glauber Costa   memcg: aggregate ...
577
  int memcg_limited_groups_array_size;
55007d849   Glauber Costa   memcg: allocate m...
578
579
580
581
582
583
  /*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is not strictly necessary.
   *
b86278359   Li Zefan   memcg: stop using...
584
   * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
55007d849   Glauber Costa   memcg: allocate m...
585
586
   * this constant directly from cgroup, but it is understandable that this is
   * better kept as an internal representation in cgroup.c. In any case, the
b86278359   Li Zefan   memcg: stop using...
587
   * cgrp_id space is not getting any smaller, and we don't have to necessarily
55007d849   Glauber Costa   memcg: allocate m...
588
589
590
   * increase ours as well if it increases.
   */
  #define MEMCG_CACHES_MIN_SIZE 4
b86278359   Li Zefan   memcg: stop using...
591
  #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
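
/*
 * Illustrative note (not in the original source): each kmem_cache carries a
 * memcg_caches[] array indexed by memcg->kmemcg_id and sized by
 * memcg_limited_groups_array_size; it starts at MEMCG_CACHES_MIN_SIZE and
 * doubles on demand up to MEMCG_CACHES_MAX_SIZE.
 */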
55007d849   Glauber Costa   memcg: allocate m...
592

d7f25f8a2   Glauber Costa   memcg: infrastruc...
593
594
595
596
597
598
  /*
   * A lot of the calls to the cache allocation functions are expected to be
   * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and such to see this symbol as well.
   */
a8964b9b8   Glauber Costa   memcg: use static...
599
  struct static_key memcg_kmem_enabled_key;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
600
  EXPORT_SYMBOL(memcg_kmem_enabled_key);
a8964b9b8   Glauber Costa   memcg: use static...
601

f3bb3043a   Vladimir Davydov   memcg: don't call...
602
  static void memcg_free_cache_id(int id);
a8964b9b8   Glauber Costa   memcg: use static...
603
604
  static void disarm_kmem_keys(struct mem_cgroup *memcg)
  {
55007d849   Glauber Costa   memcg: allocate m...
605
  	if (memcg_kmem_is_active(memcg)) {
a8964b9b8   Glauber Costa   memcg: use static...
606
  		static_key_slow_dec(&memcg_kmem_enabled_key);
f3bb3043a   Vladimir Davydov   memcg: don't call...
607
  		memcg_free_cache_id(memcg->kmemcg_id);
55007d849   Glauber Costa   memcg: allocate m...
608
  	}
bea207c86   Glauber Costa   memcg: allow a me...
609
610
611
612
613
  	/*
  	 * This check can't live in kmem destruction function,
  	 * since the charges will outlive the cgroup
  	 */
  	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
a8964b9b8   Glauber Costa   memcg: use static...
614
615
616
617
618
619
620
621
622
623
624
625
  }
  #else
  static void disarm_kmem_keys(struct mem_cgroup *memcg)
  {
  }
  #endif /* CONFIG_MEMCG_KMEM */
  
  static void disarm_static_keys(struct mem_cgroup *memcg)
  {
  	disarm_sock_keys(memcg);
  	disarm_kmem_keys(memcg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
626
  static void drain_all_stock_async(struct mem_cgroup *memcg);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
627

f64c3f549   Balbir Singh   memory controller...
628
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
629
  mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
f64c3f549   Balbir Singh   memory controller...
630
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
631
632
  	int nid = zone_to_nid(zone);
  	int zid = zone_idx(zone);
54f72fe02   Johannes Weiner   memcg: clean up m...
633
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
634
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
635
  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
d324236b3   Wu Fengguang   memcg: add access...
636
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
637
  	return &memcg->css;
d324236b3   Wu Fengguang   memcg: add access...
638
  }
f64c3f549   Balbir Singh   memory controller...
639
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
640
  mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
f64c3f549   Balbir Singh   memory controller...
641
  {
97a6c37b3   Johannes Weiner   memcg: change pag...
642
643
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
f64c3f549   Balbir Singh   memory controller...
644

e231875ba   Jianyu Zhan   mm: memcontrol: c...
645
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
646
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
647
648
649
650
651
652
653
654
655
656
657
658
659
660
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
661
662
663
  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz,
  					 unsigned long long new_usage_in_excess)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
692
693
  static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
694
695
696
697
698
699
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
700
701
  static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  				       struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
702
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
703
704
705
  	unsigned long flags;
  
  	spin_lock_irqsave(&mctz->lock, flags);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
706
  	__mem_cgroup_remove_exceeded(mz, mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
707
  	spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
708
709
710
711
712
713
714
715
  }
  
  
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
  	unsigned long long excess;
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
716

e231875ba   Jianyu Zhan   mm: memcontrol: c...
717
  	mctz = soft_limit_tree_from_page(page);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
718
719
720
721
722
  	/*
	 * Necessary to update all ancestors when hierarchy is used,
  	 * because their event counter is not touched.
  	 */
  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
723
  		mz = mem_cgroup_page_zoneinfo(memcg, page);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
724
725
726
727
728
729
  		excess = res_counter_soft_limit_excess(&memcg->res);
  		/*
  		 * We have to update the tree if mz is on RB-tree or
  		 * mem is over its softlimit.
  		 */
  		if (excess || mz->on_tree) {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
730
731
732
  			unsigned long flags;
  
  			spin_lock_irqsave(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
733
734
  			/* if on-tree, remove it */
  			if (mz->on_tree)
cf2c81279   Johannes Weiner   mm: memcontrol: r...
735
  				__mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
736
737
738
739
  			/*
  			 * Insert again. mz->usage_in_excess will be updated.
  			 * If excess is 0, no tree ops.
  			 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
740
  			__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
741
  			spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
742
743
744
745
746
747
  		}
  	}
  }
  
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
  {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
748
  	struct mem_cgroup_tree_per_zone *mctz;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
749
750
  	struct mem_cgroup_per_zone *mz;
  	int nid, zid;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
751

e231875ba   Jianyu Zhan   mm: memcontrol: c...
752
753
754
755
  	for_each_node(nid) {
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			mctz = soft_limit_tree_node_zone(nid, zid);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
756
  			mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
  		}
  	}
  }
  
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
779
  	__mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
780
  	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
ec903c0c8   Tejun Heo   cgroup: rename cs...
781
  	    !css_tryget_online(&mz->memcg->css))
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
782
783
784
785
786
787
788
789
790
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
791
  	spin_lock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
792
  	mz = __mem_cgroup_largest_soft_limit_node(mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
793
  	spin_unlock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
794
795
  	return mz;
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
  /*
   * Implementation Note: reading percpu statistics for memcg.
   *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counter in memcg's counter.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because he accounts memory. Even if we provide a quick-and-fuzzy read, we
 * always have to visit all online cpus and make the sum. So, for now,
 * unnecessary synchronization is not implemented. (just implemented for cpu
 * hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
815
  static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
7a159cc9d   Johannes Weiner   memcg: use native...
816
  				 enum mem_cgroup_stat_index idx)
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
817
  {
7a159cc9d   Johannes Weiner   memcg: use native...
818
  	long val = 0;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
819
  	int cpu;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
820

711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
821
822
  	get_online_cpus();
  	for_each_online_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
823
  		val += per_cpu(memcg->stat->count[idx], cpu);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
824
  #ifdef CONFIG_HOTPLUG_CPU
c0ff4b854   Raghavendra K T   memcg: rename mem...
825
826
827
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.count[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
828
829
  #endif
  	put_online_cpus();
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
830
831
  	return val;
  }
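
/*
 * Illustrative usage (not in the original source): the current RSS charge,
 * summed across cpus, could be read as
 *
 *	long rss = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
 *
 * the per-cpu deltas are not synchronized, so the sum may be transiently
 * negative, which is why the value is kept signed.
 */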
c0ff4b854   Raghavendra K T   memcg: rename mem...
832
  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
e9f8974f2   Johannes Weiner   memcg: break out ...
833
834
835
836
  					    enum mem_cgroup_events_index idx)
  {
  	unsigned long val = 0;
  	int cpu;
9c5675127   David Rientjes   mm, memcg: protec...
837
  	get_online_cpus();
e9f8974f2   Johannes Weiner   memcg: break out ...
838
  	for_each_online_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
839
  		val += per_cpu(memcg->stat->events[idx], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
840
  #ifdef CONFIG_HOTPLUG_CPU
c0ff4b854   Raghavendra K T   memcg: rename mem...
841
842
843
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.events[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
e9f8974f2   Johannes Weiner   memcg: break out ...
844
  #endif
9c5675127   David Rientjes   mm, memcg: protec...
845
  	put_online_cpus();
e9f8974f2   Johannes Weiner   memcg: break out ...
846
847
  	return val;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
848
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
b070e65c0   David Rientjes   mm, memcg: add rs...
849
  					 struct page *page,
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
850
  					 int nr_pages)
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
851
  {
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
852
853
854
855
  	/*
  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
  	 * counted as CACHE even if it's on ANON LRU.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
856
  	if (PageAnon(page))
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
857
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
c0ff4b854   Raghavendra K T   memcg: rename mem...
858
  				nr_pages);
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
859
  	else
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
860
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
c0ff4b854   Raghavendra K T   memcg: rename mem...
861
  				nr_pages);
55e462b05   Balaji Rao   memcg: simple sta...
862

b070e65c0   David Rientjes   mm, memcg: add rs...
863
864
865
  	if (PageTransHuge(page))
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
  				nr_pages);
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
866
867
  	/* pagein of a big page is an event. So, ignore page size */
  	if (nr_pages > 0)
c0ff4b854   Raghavendra K T   memcg: rename mem...
868
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
869
  	else {
c0ff4b854   Raghavendra K T   memcg: rename mem...
870
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
871
872
  		nr_pages = -nr_pages; /* for event */
  	}
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
873

13114716c   Johannes Weiner   mm: memcg: keep r...
874
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
875
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
876
  unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
074291fea   Konstantin Khlebnikov   mm/vmscan: replac...
877
878
879
880
881
882
  {
  	struct mem_cgroup_per_zone *mz;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	return mz->lru_size[lru];
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
883
884
885
  static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
  						  int nid,
  						  unsigned int lru_mask)
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
886
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
887
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
888
  	int zid;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
889
  	VM_BUG_ON((unsigned)nid >= nr_node_ids);
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
890

e231875ba   Jianyu Zhan   mm: memcontrol: c...
891
892
893
894
895
896
897
898
899
900
901
902
  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		struct mem_cgroup_per_zone *mz;
  		enum lru_list lru;
  
  		for_each_lru(lru) {
  			if (!(BIT(lru) & lru_mask))
  				continue;
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			nr += mz->lru_size[lru];
  		}
  	}
  	return nr;
889976dbc   Ying Han   memcg: reclaim me...
903
  }
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
904

c0ff4b854   Raghavendra K T   memcg: rename mem...
905
  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
906
  			unsigned int lru_mask)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
907
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
908
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
909
  	int nid;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
910

31aaea4aa   Lai Jiangshan   memcontrol: use N...
911
  	for_each_node_state(nid, N_MEMORY)
e231875ba   Jianyu Zhan   mm: memcontrol: c...
912
913
  		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
  	return nr;
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
914
  }
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
915
916
  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  				       enum mem_cgroup_events_target target)
7a159cc9d   Johannes Weiner   memcg: use native...
917
918
  {
  	unsigned long val, next;
13114716c   Johannes Weiner   mm: memcg: keep r...
919
  	val = __this_cpu_read(memcg->stat->nr_page_events);
4799401fe   Steven Rostedt   memcg: Fix race c...
920
  	next = __this_cpu_read(memcg->stat->targets[target]);
7a159cc9d   Johannes Weiner   memcg: use native...
921
  	/* from time_after() in jiffies.h */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
922
923
924
925
926
  	if ((long)next - (long)val < 0) {
  		switch (target) {
  		case MEM_CGROUP_TARGET_THRESH:
  			next = val + THRESHOLDS_EVENTS_TARGET;
  			break;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
927
928
929
  		case MEM_CGROUP_TARGET_SOFTLIMIT:
  			next = val + SOFTLIMIT_EVENTS_TARGET;
  			break;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
930
931
932
933
934
935
936
937
  		case MEM_CGROUP_TARGET_NUMAINFO:
  			next = val + NUMAINFO_EVENTS_TARGET;
  			break;
  		default:
  			break;
  		}
  		__this_cpu_write(memcg->stat->targets[target], next);
  		return true;
7a159cc9d   Johannes Weiner   memcg: use native...
938
  	}
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
939
  	return false;
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
940
941
942
943
944
945
  }
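
/*
 * Illustrative note (not in the original source): with the targets above,
 * each CPU re-evaluates its thresholds roughly once per
 * THRESHOLDS_EVENTS_TARGET (128) page events, and refreshes the soft-limit
 * tree and NUMA scan info only once per 1024 events.
 */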
  
  /*
   * Check events in order.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
946
  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
947
948
  {
  	/* threshold event is triggered in finer grain than soft limit */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
949
950
  	if (unlikely(mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_THRESH))) {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
951
  		bool do_softlimit;
82b3f2a71   Andrew Morton   mm/memcontrol.c: ...
952
  		bool do_numainfo __maybe_unused;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
953

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
954
955
  		do_softlimit = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_SOFTLIMIT);
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
956
957
958
959
  #if MAX_NUMNODES > 1
  		do_numainfo = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_NUMAINFO);
  #endif
c0ff4b854   Raghavendra K T   memcg: rename mem...
960
  		mem_cgroup_threshold(memcg);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
961
962
  		if (unlikely(do_softlimit))
  			mem_cgroup_update_tree(memcg, page);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
963
  #if MAX_NUMNODES > 1
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
964
  		if (unlikely(do_numainfo))
c0ff4b854   Raghavendra K T   memcg: rename mem...
965
  			atomic_inc(&memcg->numainfo_events);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
966
  #endif
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
967
  	}
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
968
  }
cf475ad28   Balbir Singh   cgroups: add an o...
969
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
78fb74669   Pavel Emelianov   Memory controller...
970
  {
31a78f23b   Balbir Singh   mm owner: fix rac...
971
972
973
974
975
976
977
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
073219e99   Tejun Heo   cgroup: clean up ...
978
  	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
78fb74669   Pavel Emelianov   Memory controller...
979
  }
df3819754   Johannes Weiner   memcg: get_mem_cg...
980
  static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
981
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
982
  	struct mem_cgroup *memcg = NULL;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
983

54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
984
985
  	rcu_read_lock();
  	do {
6f6acb005   Michal Hocko   memcg: fix swapca...
986
987
988
989
990
991
  		/*
		 * Page cache insertions can happen without an
  		 * actual mm context, e.g. during disk probing
  		 * on boot, loopback IO, acct() writes etc.
  		 */
  		if (unlikely(!mm))
df3819754   Johannes Weiner   memcg: get_mem_cg...
992
  			memcg = root_mem_cgroup;
6f6acb005   Michal Hocko   memcg: fix swapca...
993
994
995
996
997
  		else {
  			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  			if (unlikely(!memcg))
  				memcg = root_mem_cgroup;
  		}
ec903c0c8   Tejun Heo   cgroup: rename cs...
998
  	} while (!css_tryget_online(&memcg->css));
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
999
  	rcu_read_unlock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1000
  	return memcg;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
1001
  }
16248d8fe   Michal Hocko   memcg: further si...
1002
1003
1004
1005
1006
1007
1008
  /*
 * Returns the next (in a pre-order walk) alive memcg (with elevated css
   * ref. count) or NULL if the whole root's subtree has been visited.
   *
   * helper function to be used by mem_cgroup_iter
   */
  static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
694fbc0fe   Andrew Morton   revert "memcg: en...
1009
  		struct mem_cgroup *last_visited)
16248d8fe   Michal Hocko   memcg: further si...
1010
  {
492eb21b9   Tejun Heo   cgroup: make hier...
1011
  	struct cgroup_subsys_state *prev_css, *next_css;
16248d8fe   Michal Hocko   memcg: further si...
1012

bd8815a6d   Tejun Heo   cgroup: make css_...
1013
  	prev_css = last_visited ? &last_visited->css : NULL;
16248d8fe   Michal Hocko   memcg: further si...
1014
  skip_node:
492eb21b9   Tejun Heo   cgroup: make hier...
1015
  	next_css = css_next_descendant_pre(prev_css, &root->css);
16248d8fe   Michal Hocko   memcg: further si...
1016
1017
1018
1019
1020
1021
1022
  
  	/*
  	 * Even if we found a group we have to make sure it is
  	 * alive. css && !memcg means that the groups should be
  	 * skipped and we should continue the tree walk.
  	 * last_visited css is safe to use because it is
  	 * protected by css_get and the tree walk is rcu safe.
0eef61566   Michal Hocko   memcg: fix css re...
1023
1024
1025
1026
1027
1028
1029
1030
  	 *
  	 * We do not take a reference on the root of the tree walk
  	 * because we might race with the root removal when it would
  	 * be the only node in the iterated hierarchy and mem_cgroup_iter
  	 * would end up in an endless loop because it expects that at
  	 * least one valid node will be returned. Root cannot disappear
	 * because the caller of the iterator should already hold it, so
	 * skipping the css reference should be safe.
16248d8fe   Michal Hocko   memcg: further si...
1031
  	 */
492eb21b9   Tejun Heo   cgroup: make hier...
1032
  	if (next_css) {
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
  		struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
  
  		if (next_css == &root->css)
  			return memcg;
  
  		if (css_tryget_online(next_css)) {
  			/*
  			 * Make sure the memcg is initialized:
			 * mem_cgroup_css_online() orders the
  			 * initialization against setting the flag.
  			 */
  			if (smp_load_acquire(&memcg->initialized))
  				return memcg;
  			css_put(next_css);
  		}
0eef61566   Michal Hocko   memcg: fix css re...
1048
1049
1050
  
  		prev_css = next_css;
  		goto skip_node;
16248d8fe   Michal Hocko   memcg: further si...
1051
1052
1053
1054
  	}
  
  	return NULL;
  }
519ebea3b   Johannes Weiner   mm: memcontrol: f...
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
  static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
  {
  	/*
  	 * When a group in the hierarchy below root is destroyed, the
  	 * hierarchy iterator can no longer be trusted since it might
  	 * have pointed to the destroyed group.  Invalidate it.
  	 */
  	atomic_inc(&root->dead_count);
  }
  
  static struct mem_cgroup *
  mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
  		     struct mem_cgroup *root,
  		     int *sequence)
  {
  	struct mem_cgroup *position = NULL;
  	/*
  	 * A cgroup destruction happens in two stages: offlining and
	 * release.  They are separated by an RCU grace period.
  	 *
  	 * If the iterator is valid, we may still race with an
  	 * offlining.  The RCU lock ensures the object won't be
  	 * released, tryget will fail if we lost the race.
  	 */
  	*sequence = atomic_read(&root->dead_count);
  	if (iter->last_dead_count == *sequence) {
  		smp_rmb();
  		position = iter->last_visited;
ecc736fc3   Michal Hocko   memcg: fix endles...
1083
1084
1085
1086
1087
1088
1089
1090
  
  		/*
  		 * We cannot take a reference to root because we might race
		 * with root removal; returning NULL would then end up in
		 * an endless loop at the iterator user level, because root
		 * would be returned all the time.
  		 */
  		if (position && position != root &&
ec903c0c8   Tejun Heo   cgroup: rename cs...
1091
  		    !css_tryget_online(&position->css))
519ebea3b   Johannes Weiner   mm: memcontrol: f...
1092
1093
1094
1095
1096
1097
1098
1099
  			position = NULL;
  	}
  	return position;
  }
  
  static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
  				   struct mem_cgroup *last_visited,
  				   struct mem_cgroup *new_position,
ecc736fc3   Michal Hocko   memcg: fix endles...
1100
  				   struct mem_cgroup *root,
519ebea3b   Johannes Weiner   mm: memcontrol: f...
1101
1102
  				   int sequence)
  {
ecc736fc3   Michal Hocko   memcg: fix endles...
1103
1104
  	/* root reference counting symmetric to mem_cgroup_iter_load */
  	if (last_visited && last_visited != root)
519ebea3b   Johannes Weiner   mm: memcontrol: f...
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
  		css_put(&last_visited->css);
  	/*
  	 * We store the sequence count from the time @last_visited was
  	 * loaded successfully instead of rereading it here so that we
  	 * don't lose destruction events in between.  We could have
  	 * raced with the destruction of @new_position after all.
  	 */
  	iter->last_visited = new_position;
  	smp_wmb();
  	iter->last_dead_count = sequence;
  }
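/*
 * Illustrative pairing of the barriers used above and in
 * mem_cgroup_iter_load() (a sketch, not part of the original source):
 *
 *       iter_update (writer)              iter_load (reader)
 *       iter->last_visited = new;         seq = atomic_read(&root->dead_count);
 *       smp_wmb();                        if (iter->last_dead_count == seq) {
 *       iter->last_dead_count = seq;              smp_rmb();
 *                                                 position = iter->last_visited;
 *
 * A reader that sees a matching dead_count is therefore guaranteed to see
 * the last_visited pointer published for that generation, and any cached
 * pointer is only used after css_tryget_online() succeeds.
 */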
5660048cc   Johannes Weiner   mm: move memcg hi...
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
   * @prev: previously returned memcg, NULL on first invocation
   * @reclaim: cookie for shared reclaim walks, NULL for full walks
   *
   * Returns references to children of the hierarchy below @root, or
   * @root itself, or %NULL after a full round-trip.
   *
   * Caller must pass the return value in @prev on subsequent
   * invocations for reference counting, or use mem_cgroup_iter_break()
   * to cancel a hierarchy walk before the round-trip is complete.
   *
   * Reclaimers can specify a zone and a priority level in @reclaim to
   * divide up the memcgs in the hierarchy among all concurrent
   * reclaimers operating on the same zone and priority.
   */
694fbc0fe   Andrew Morton   revert "memcg: en...
1133
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
5660048cc   Johannes Weiner   mm: move memcg hi...
1134
  				   struct mem_cgroup *prev,
694fbc0fe   Andrew Morton   revert "memcg: en...
1135
  				   struct mem_cgroup_reclaim_cookie *reclaim)
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1136
  {
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1137
  	struct mem_cgroup *memcg = NULL;
542f85f9a   Michal Hocko   memcg: rework mem...
1138
  	struct mem_cgroup *last_visited = NULL;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1139

694fbc0fe   Andrew Morton   revert "memcg: en...
1140
1141
  	if (mem_cgroup_disabled())
  		return NULL;
5660048cc   Johannes Weiner   mm: move memcg hi...
1142

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1143
1144
  	if (!root)
  		root = root_mem_cgroup;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1145

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1146
  	if (prev && !reclaim)
542f85f9a   Michal Hocko   memcg: rework mem...
1147
  		last_visited = prev;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1148

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1149
1150
  	if (!root->use_hierarchy && root != root_mem_cgroup) {
  		if (prev)
c40046f3a   Michal Hocko   memcg: keep prev'...
1151
  			goto out_css_put;
694fbc0fe   Andrew Morton   revert "memcg: en...
1152
  		return root;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1153
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1154

542f85f9a   Michal Hocko   memcg: rework mem...
1155
  	rcu_read_lock();
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1156
  	while (!memcg) {
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1157
  		struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
519ebea3b   Johannes Weiner   mm: memcontrol: f...
1158
  		int uninitialized_var(seq);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1159

527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1160
  		if (reclaim) {
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1161
  			struct mem_cgroup_per_zone *mz;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
1162
  			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1163
  			iter = &mz->reclaim_iter[reclaim->priority];
542f85f9a   Michal Hocko   memcg: rework mem...
1164
  			if (prev && reclaim->generation != iter->generation) {
5f5781619   Michal Hocko   memcg: relax memc...
1165
  				iter->last_visited = NULL;
542f85f9a   Michal Hocko   memcg: rework mem...
1166
1167
  				goto out_unlock;
  			}
5f5781619   Michal Hocko   memcg: relax memc...
1168

519ebea3b   Johannes Weiner   mm: memcontrol: f...
1169
  			last_visited = mem_cgroup_iter_load(iter, root, &seq);
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1170
  		}
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1171

694fbc0fe   Andrew Morton   revert "memcg: en...
1172
  		memcg = __mem_cgroup_iter_next(root, last_visited);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1173

527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1174
  		if (reclaim) {
ecc736fc3   Michal Hocko   memcg: fix endles...
1175
1176
  			mem_cgroup_iter_update(iter, last_visited, memcg, root,
  					seq);
542f85f9a   Michal Hocko   memcg: rework mem...
1177

19f394028   Michal Hocko   memcg: simplify m...
1178
  			if (!memcg)
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1179
1180
1181
1182
  				iter->generation++;
  			else if (!prev && memcg)
  				reclaim->generation = iter->generation;
  		}
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1183

694fbc0fe   Andrew Morton   revert "memcg: en...
1184
  		if (prev && !memcg)
542f85f9a   Michal Hocko   memcg: rework mem...
1185
  			goto out_unlock;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1186
  	}
542f85f9a   Michal Hocko   memcg: rework mem...
1187
1188
  out_unlock:
  	rcu_read_unlock();
c40046f3a   Michal Hocko   memcg: keep prev'...
1189
1190
1191
  out_css_put:
  	if (prev && prev != root)
  		css_put(&prev->css);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1192
  	return memcg;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1193
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1194

5660048cc   Johannes Weiner   mm: move memcg hi...
1195
1196
1197
1198
1199
1200
1201
  /**
   * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   * @root: hierarchy root
   * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   */
  void mem_cgroup_iter_break(struct mem_cgroup *root,
  			   struct mem_cgroup *prev)
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1202
1203
1204
1205
1206
1207
  {
  	if (!root)
  		root = root_mem_cgroup;
  	if (prev && prev != root)
  		css_put(&prev->css);
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1208

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1209
1210
1211
1212
1213
1214
  /*
   * Iteration constructs for visiting all cgroups (under a tree).  If
   * loops are exited prematurely (break), mem_cgroup_iter_break() must
   * be used for reference counting.
   */
  #define for_each_mem_cgroup_tree(iter, root)		\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1215
  	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1216
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1217
  	     iter = mem_cgroup_iter(root, iter, NULL))
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1218

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1219
  #define for_each_mem_cgroup(iter)			\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1220
  	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1221
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1222
  	     iter = mem_cgroup_iter(NULL, iter, NULL))
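/*
 * Example usage of the iterators above (an illustrative sketch that mirrors
 * how the rest of this file uses them): a walk that may bail out early must
 * hand its current position to mem_cgroup_iter_break() so the css reference
 * taken by mem_cgroup_iter() is dropped.  stop_condition() below is a
 * hypothetical per-memcg predicate supplied by the caller.
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (stop_condition(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */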
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1223

68ae564bb   David Rientjes   mm, memcg: avoid ...
1224
  void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
456f998ec   Ying Han   memcg: add the pa...
1225
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1226
  	struct mem_cgroup *memcg;
456f998ec   Ying Han   memcg: add the pa...
1227

456f998ec   Ying Han   memcg: add the pa...
1228
  	rcu_read_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1229
1230
  	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  	if (unlikely(!memcg))
456f998ec   Ying Han   memcg: add the pa...
1231
1232
1233
  		goto out;
  
  	switch (idx) {
456f998ec   Ying Han   memcg: add the pa...
1234
  	case PGFAULT:
0e574a932   Johannes Weiner   mm: memcg: clean ...
1235
1236
1237
1238
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
  		break;
  	case PGMAJFAULT:
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
456f998ec   Ying Han   memcg: add the pa...
1239
1240
1241
1242
1243
1244
1245
  		break;
  	default:
  		BUG();
  	}
  out:
  	rcu_read_unlock();
  }
68ae564bb   David Rientjes   mm, memcg: avoid ...
1246
  EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
456f998ec   Ying Han   memcg: add the pa...
1247

925b7673c   Johannes Weiner   mm: make per-memc...
1248
1249
1250
  /**
   * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
   * @zone: zone of the wanted lruvec
fa9add641   Hugh Dickins   mm/memcg: apply a...
1251
   * @memcg: memcg of the wanted lruvec
925b7673c   Johannes Weiner   mm: make per-memc...
1252
1253
1254
1255
1256
1257
1258
1259
1260
   *
   * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
   * is disabled.
   */
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1261
  	struct lruvec *lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1262

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1263
1264
1265
1266
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1267

e231875ba   Jianyu Zhan   mm: memcontrol: c...
1268
  	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1279
  }
925b7673c   Johannes Weiner   mm: make per-memc...
1280
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1281
   * mem_cgroup_page_lruvec - return lruvec for adding an lru page
925b7673c   Johannes Weiner   mm: make per-memc...
1282
   * @page: the page
fa9add641   Hugh Dickins   mm/memcg: apply a...
1283
   * @zone: zone of the page
925b7673c   Johannes Weiner   mm: make per-memc...
1284
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1285
  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1286
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1287
  	struct mem_cgroup_per_zone *mz;
925b7673c   Johannes Weiner   mm: make per-memc...
1288
1289
  	struct mem_cgroup *memcg;
  	struct page_cgroup *pc;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1290
  	struct lruvec *lruvec;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
1291

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1292
1293
1294
1295
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1296

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1297
  	pc = lookup_page_cgroup(page);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
1298
  	memcg = pc->mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1299
1300
  
  	/*
fa9add641   Hugh Dickins   mm/memcg: apply a...
1301
  	 * Surreptitiously switch any uncharged offlist page to root:
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1302
1303
1304
1305
1306
1307
1308
  	 * an uncharged page off lru does nothing to secure
  	 * its former mem_cgroup from sudden removal.
  	 *
  	 * Our caller holds lru_lock, and PageCgroupUsed is updated
  	 * under page_cgroup lock: between them, they make all uses
  	 * of pc->mem_cgroup safe.
  	 */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1309
  	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1310
  		pc->mem_cgroup = memcg = root_mem_cgroup;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
1311
  	mz = mem_cgroup_page_zoneinfo(memcg, page);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1322
  }
b69408e88   Christoph Lameter   vmscan: Use an in...
1323

925b7673c   Johannes Weiner   mm: make per-memc...
1324
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1325
1326
1327
1328
   * mem_cgroup_update_lru_size - account for adding or removing an lru page
   * @lruvec: mem_cgroup per zone lru vector
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
925b7673c   Johannes Weiner   mm: make per-memc...
1329
   *
fa9add641   Hugh Dickins   mm/memcg: apply a...
1330
1331
   * This function must be called when a page is added to or removed from an
   * lru list.
3f58a8294   Minchan Kim   memcg: move memcg...
1332
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1333
1334
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  				int nr_pages)
3f58a8294   Minchan Kim   memcg: move memcg...
1335
1336
  {
  	struct mem_cgroup_per_zone *mz;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1337
  	unsigned long *lru_size;
3f58a8294   Minchan Kim   memcg: move memcg...
1338
1339
1340
  
  	if (mem_cgroup_disabled())
  		return;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1341
1342
1343
1344
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	lru_size = mz->lru_size + lru;
  	*lru_size += nr_pages;
  	VM_BUG_ON((long)(*lru_size) < 0);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1345
  }
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
1346

08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1347
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1348
   * Checks whether given mem is same or in the root_mem_cgroup's
3e92041d6   Michal Hocko   memcg: add mem_cg...
1349
1350
   * hierarchy subtree
   */
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1351
1352
  bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
  				  struct mem_cgroup *memcg)
3e92041d6   Michal Hocko   memcg: add mem_cg...
1353
  {
91c63734f   Johannes Weiner   kernel: cgroup: p...
1354
1355
  	if (root_memcg == memcg)
  		return true;
3a981f482   Hugh Dickins   memcg: fix use_hi...
1356
  	if (!root_memcg->use_hierarchy || !memcg)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1357
  		return false;
b47f77b5a   Li Zefan   memcg: convert to...
1358
  	return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1359
1360
1361
1362
1363
1364
  }
  
  static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
  				       struct mem_cgroup *memcg)
  {
  	bool ret;
91c63734f   Johannes Weiner   kernel: cgroup: p...
1365
  	rcu_read_lock();
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1366
  	ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
91c63734f   Johannes Weiner   kernel: cgroup: p...
1367
1368
  	rcu_read_unlock();
  	return ret;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1369
  }
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1370
1371
  bool task_in_mem_cgroup(struct task_struct *task,
  			const struct mem_cgroup *memcg)
4c4a22148   David Rientjes   memcontrol: move ...
1372
  {
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1373
  	struct mem_cgroup *curr = NULL;
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1374
  	struct task_struct *p;
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1375
  	bool ret;
4c4a22148   David Rientjes   memcontrol: move ...
1376

158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1377
  	p = find_lock_task_mm(task);
de077d222   David Rientjes   oom, memcg: fix e...
1378
  	if (p) {
df3819754   Johannes Weiner   memcg: get_mem_cg...
1379
  		curr = get_mem_cgroup_from_mm(p->mm);
de077d222   David Rientjes   oom, memcg: fix e...
1380
1381
1382
1383
1384
1385
1386
  		task_unlock(p);
  	} else {
  		/*
  		 * All threads may have already detached their mm's, but the oom
  		 * killer still needs to detect if they have already been oom
  		 * killed to prevent needlessly killing additional tasks.
  		 */
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1387
  		rcu_read_lock();
de077d222   David Rientjes   oom, memcg: fix e...
1388
1389
1390
  		curr = mem_cgroup_from_task(task);
  		if (curr)
  			css_get(&curr->css);
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1391
  		rcu_read_unlock();
de077d222   David Rientjes   oom, memcg: fix e...
1392
  	}
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1393
  	/*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1394
	 * We should check use_hierarchy of "memcg" not "curr", because checking
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1395
	 * use_hierarchy of "curr" here would make this function return true if hierarchy is
c0ff4b854   Raghavendra K T   memcg: rename mem...
1396
1397
  	 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
	 * hierarchy (even if use_hierarchy is disabled in "memcg").
d31f56dbf   Daisuke Nishimura   memcg: avoid oom-...
1398
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1399
  	ret = mem_cgroup_same_or_subtree(memcg, curr);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1400
  	css_put(&curr->css);
4c4a22148   David Rientjes   memcontrol: move ...
1401
1402
  	return ret;
  }
c56d5c7df   Konstantin Khlebnikov   mm/vmscan: push l...
1403
  int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
14797e236   KOSAKI Motohiro   memcg: add inacti...
1404
  {
9b272977e   Johannes Weiner   memcg: skip scann...
1405
  	unsigned long inactive_ratio;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1406
  	unsigned long inactive;
9b272977e   Johannes Weiner   memcg: skip scann...
1407
  	unsigned long active;
c772be939   KOSAKI Motohiro   memcg: fix calcul...
1408
  	unsigned long gb;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1409

4d7dcca21   Hugh Dickins   mm/memcg: get_lru...
1410
1411
  	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
  	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
14797e236   KOSAKI Motohiro   memcg: add inacti...
1412

c772be939   KOSAKI Motohiro   memcg: fix calcul...
1413
1414
1415
1416
1417
  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
  	if (gb)
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
9b272977e   Johannes Weiner   memcg: skip scann...
1418
  	return inactive * inactive_ratio < active;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1419
  }
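/*
 * Worked example for the ratio above (illustrative, assuming 4K pages):
 * with about 4GB of anon pages on the lruvec, gb == 4 and
 * int_sqrt(10 * 4) == 6, so inactive anon is reported as low only while
 * inactive * 6 < active, i.e. while less than roughly 1/7 of the anon
 * pages sit on the inactive list.  Below 1GB the ratio falls back to 1
 * and the check degenerates to inactive < active.
 */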
6d61ef409   Balbir Singh   memcg: memory cgr...
1420
1421
  #define mem_cgroup_from_res_counter(counter, member)	\
  	container_of(counter, struct mem_cgroup, member)
19942822d   Johannes Weiner   memcg: prevent en...
1422
  /**
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1423
   * mem_cgroup_margin - calculate chargeable space of a memory cgroup
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1424
   * @memcg: the memory cgroup
19942822d   Johannes Weiner   memcg: prevent en...
1425
   *
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1426
 * Returns the maximum amount of memory @memcg can be charged with, in
7ec99d621   Johannes Weiner   memcg: unify char...
1427
   * pages.
19942822d   Johannes Weiner   memcg: prevent en...
1428
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1429
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
19942822d   Johannes Weiner   memcg: prevent en...
1430
  {
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1431
  	unsigned long long margin;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1432
  	margin = res_counter_margin(&memcg->res);
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1433
  	if (do_swap_account)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1434
  		margin = min(margin, res_counter_margin(&memcg->memsw));
7ec99d621   Johannes Weiner   memcg: unify char...
1435
  	return margin >> PAGE_SHIFT;
19942822d   Johannes Weiner   memcg: prevent en...
1436
  }
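/*
 * Worked example (illustrative, assuming 4K pages and swap accounting
 * enabled): with a memory limit of 512MB at 384MB usage and a memsw limit
 * of 768MB at 640MB usage, both res_counter_margin() calls yield 128MB,
 * so the function returns 128MB >> PAGE_SHIFT == 32768 chargeable pages.
 */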
1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
1437
  int mem_cgroup_swappiness(struct mem_cgroup *memcg)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1438
  {
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1439
  	/* root ? */
14208b0ec   Linus Torvalds   Merge branch 'for...
1440
  	if (mem_cgroup_disabled() || !memcg->css.parent)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1441
  		return vm_swappiness;
bf1ff2635   Johannes Weiner   memcg: remove mem...
1442
  	return memcg->swappiness;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1443
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
  /*
 * memcg->moving_account is used for checking the possibility that some thread is
   * calling move_account(). When a thread on CPU-A starts moving pages under
   * a memcg, other threads should check memcg->moving_account under
   * rcu_read_lock(), like this:
   *
   *         CPU-A                                    CPU-B
   *                                              rcu_read_lock()
 *         memcg->moving_account+1              if (memcg->moving_account)
   *                                                   take heavy locks.
   *         synchronize_rcu()                    update something.
   *                                              rcu_read_unlock()
   *         start move here.
   */
4331f7d33   KAMEZAWA Hiroyuki   memcg: fix perfor...
1458

c0ff4b854   Raghavendra K T   memcg: rename mem...
1459
  static void mem_cgroup_start_move(struct mem_cgroup *memcg)
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1460
  {
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1461
  	atomic_inc(&memcg->moving_account);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1462
1463
  	synchronize_rcu();
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1464
  static void mem_cgroup_end_move(struct mem_cgroup *memcg)
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1465
  {
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1466
1467
1468
1469
  	/*
  	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
  	 * We check NULL in callee rather than caller.
  	 */
d7365e783   Johannes Weiner   mm: memcontrol: f...
1470
  	if (memcg)
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1471
  		atomic_dec(&memcg->moving_account);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1472
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1473

32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1474
  /*
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1475
 * A routine for checking whether "mem" is under move_account() or not.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1476
   *
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1477
1478
1479
 * Checks whether a cgroup is mc.from or mc.to, or under the hierarchy of
 * the moving cgroups. This is used for waiting at high memory pressure
 * caused by "move".
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1480
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1481
  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1482
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1483
1484
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1485
  	bool ret = false;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1486
1487
1488
1489
1490
1491
1492
1493
1494
  	/*
  	 * Unlike task_move routines, we access mc.to, mc.from not under
  	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
  	 */
  	spin_lock(&mc.lock);
  	from = mc.from;
  	to = mc.to;
  	if (!from)
  		goto unlock;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1495

c0ff4b854   Raghavendra K T   memcg: rename mem...
1496
1497
  	ret = mem_cgroup_same_or_subtree(memcg, from)
  		|| mem_cgroup_same_or_subtree(memcg, to);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1498
1499
  unlock:
  	spin_unlock(&mc.lock);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1500
1501
  	return ret;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1502
  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1503
1504
  {
  	if (mc.moving_task && current != mc.moving_task) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1505
  		if (mem_cgroup_under_move(memcg)) {
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
  			DEFINE_WAIT(wait);
  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  			/* moving charge context might have finished. */
  			if (mc.moving_task)
  				schedule();
  			finish_wait(&mc.waitq, &wait);
  			return true;
  		}
  	}
  	return false;
  }
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
1517
1518
1519
1520
  /*
   * Take this lock when
   * - a code tries to modify page's memcg while it's USED.
   * - a code tries to modify page state accounting in a memcg.
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
   */
  static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
  				  unsigned long *flags)
  {
  	spin_lock_irqsave(&memcg->move_lock, *flags);
  }
  
  static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  				unsigned long *flags)
  {
  	spin_unlock_irqrestore(&memcg->move_lock, *flags);
  }
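/*
 * Typical usage of the two helpers above (an illustrative sketch): a stats
 * updater that must not race with move_account() brackets its update with
 * the per-memcg move_lock, e.g.
 *
 *	unsigned long flags;
 *
 *	move_lock_mem_cgroup(memcg, &flags);
 *	... update pc->mem_cgroup based accounting ...
 *	move_unlock_mem_cgroup(memcg, &flags);
 *
 * Interrupts stay disabled for the duration because the lock is taken with
 * spin_lock_irqsave().
 */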
58cf188ed   Sha Zhengju   memcg, oom: provi...
1533
  #define K(x) ((x) << (PAGE_SHIFT-10))
e222432bf   Balbir Singh   memcg: show memcg...
1534
  /**
58cf188ed   Sha Zhengju   memcg, oom: provi...
1535
   * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
e222432bf   Balbir Singh   memcg: show memcg...
1536
1537
1538
1539
1540
1541
1542
1543
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
e61734c55   Tejun Heo   cgroup: remove cg...
1544
  	/* oom_info_lock ensures that parallel ooms do not interleave */
08088cb9a   Michal Hocko   memcg: change oom...
1545
  	static DEFINE_MUTEX(oom_info_lock);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1546
1547
  	struct mem_cgroup *iter;
  	unsigned int i;
e222432bf   Balbir Singh   memcg: show memcg...
1548

58cf188ed   Sha Zhengju   memcg, oom: provi...
1549
  	if (!p)
e222432bf   Balbir Singh   memcg: show memcg...
1550
  		return;
08088cb9a   Michal Hocko   memcg: change oom...
1551
  	mutex_lock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1552
  	rcu_read_lock();
e61734c55   Tejun Heo   cgroup: remove cg...
1553
1554
1555
1556
1557
1558
  	pr_info("Task in ");
  	pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
  	pr_info(" killed as a result of limit of ");
  	pr_cont_cgroup_path(memcg->css.cgroup);
  	pr_info("
  ");
e222432bf   Balbir Singh   memcg: show memcg...
1559

e222432bf   Balbir Singh   memcg: show memcg...
1560
  	rcu_read_unlock();
d045197ff   Andrew Morton   mm/memcontrol.c: ...
1561
1562
  	pr_info("memory: usage %llukB, limit %llukB, failcnt %llu
  ",
e222432bf   Balbir Singh   memcg: show memcg...
1563
1564
1565
  		res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->res, RES_FAILCNT));
d045197ff   Andrew Morton   mm/memcontrol.c: ...
1566
1567
  	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu
  ",
e222432bf   Balbir Singh   memcg: show memcg...
1568
1569
1570
  		res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
d045197ff   Andrew Morton   mm/memcontrol.c: ...
1571
1572
  	pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu
  ",
510fc4e11   Glauber Costa   memcg: kmem accou...
1573
1574
1575
  		res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
  		res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
  		res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
58cf188ed   Sha Zhengju   memcg, oom: provi...
1576
1577
  
  	for_each_mem_cgroup_tree(iter, memcg) {
e61734c55   Tejun Heo   cgroup: remove cg...
1578
1579
  		pr_info("Memory cgroup stats for ");
  		pr_cont_cgroup_path(iter->css.cgroup);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
  		pr_cont(":");
  
  		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
  				continue;
  			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
  				K(mem_cgroup_read_stat(iter, i)));
  		}
  
  		for (i = 0; i < NR_LRU_LISTS; i++)
  			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
  				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
  
  		pr_cont("
  ");
  	}
08088cb9a   Michal Hocko   memcg: change oom...
1596
  	mutex_unlock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1597
  }
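/*
 * The output produced above looks roughly like the following (all paths
 * and numbers are illustrative, not taken from a real log):
 *
 *	Task in /user/app killed as a result of limit of /user
 *	memory: usage 524288kB, limit 524288kB, failcnt 123
 *	memory+swap: usage 524288kB, limit 1048576kB, failcnt 0
 *	kmem: usage 0kB, limit <unlimited>kB, failcnt 0
 *	Memory cgroup stats for /user: cache:...KB rss:...KB ... (one entry
 *	per stat name, followed by the per-LRU page counts)
 */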
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1598
1599
1600
1601
  /*
 * This function returns the number of memcgs under the hierarchy tree.
 * Returns 1 (self count) if there are no children.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1602
  static int mem_cgroup_count_children(struct mem_cgroup *memcg)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1603
1604
  {
  	int num = 0;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1605
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1606
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1607
  		num++;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1608
1609
  	return num;
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1610
  /*
a63d83f42   David Rientjes   oom: badness heur...
1611
1612
   * Return the memory (and swap, if configured) limit for a memcg.
   */
9cbb78bb3   David Rientjes   mm, memcg: introd...
1613
  static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
a63d83f42   David Rientjes   oom: badness heur...
1614
1615
  {
  	u64 limit;
a63d83f42   David Rientjes   oom: badness heur...
1616

f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1617
  	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1618

a63d83f42   David Rientjes   oom: badness heur...
1619
  	/*
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1620
  	 * Do not consider swap space if we cannot swap due to swappiness
a63d83f42   David Rientjes   oom: badness heur...
1621
  	 */
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
  	if (mem_cgroup_swappiness(memcg)) {
  		u64 memsw;
  
  		limit += total_swap_pages << PAGE_SHIFT;
  		memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  
  		/*
  		 * If memsw is finite and limits the amount of swap space
  		 * available to this memcg, return that limit.
  		 */
  		limit = min(limit, memsw);
  	}
  
  	return limit;
a63d83f42   David Rientjes   oom: badness heur...
1636
  }
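/*
 * Worked example (illustrative): with a 1GB memory limit, 2GB of total
 * swap space and a 2GB memsw limit, the candidate limit is 1GB + 2GB = 3GB,
 * which min() clamps to the 2GB memsw limit, so the OOM badness below is
 * sized against 2GB.  With swappiness == 0 the swap term is skipped and
 * 1GB is returned instead.
 */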
19965460e   David Rientjes   mm, memcg: make m...
1637
1638
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
  				     int order)
9cbb78bb3   David Rientjes   mm, memcg: introd...
1639
1640
1641
1642
1643
1644
  {
  	struct mem_cgroup *iter;
  	unsigned long chosen_points = 0;
  	unsigned long totalpages;
  	unsigned int points = 0;
  	struct task_struct *chosen = NULL;
876aafbfd   David Rientjes   mm, memcg: move a...
1645
  	/*
465adcf1e   David Rientjes   mm, memcg: give e...
1646
1647
1648
  	 * If current has a pending SIGKILL or is exiting, then automatically
  	 * select it.  The goal is to allow it to allocate so that it may
  	 * quickly exit and free its memory.
876aafbfd   David Rientjes   mm, memcg: move a...
1649
  	 */
465adcf1e   David Rientjes   mm, memcg: give e...
1650
  	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
876aafbfd   David Rientjes   mm, memcg: move a...
1651
1652
1653
1654
1655
  		set_thread_flag(TIF_MEMDIE);
  		return;
  	}
  
  	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1656
1657
  	totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
  	for_each_mem_cgroup_tree(iter, memcg) {
72ec70299   Tejun Heo   cgroup: make task...
1658
  		struct css_task_iter it;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1659
  		struct task_struct *task;
72ec70299   Tejun Heo   cgroup: make task...
1660
1661
  		css_task_iter_start(&iter->css, &it);
  		while ((task = css_task_iter_next(&it))) {
9cbb78bb3   David Rientjes   mm, memcg: introd...
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
  			switch (oom_scan_process_thread(task, totalpages, NULL,
  							false)) {
  			case OOM_SCAN_SELECT:
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = ULONG_MAX;
  				get_task_struct(chosen);
  				/* fall through */
  			case OOM_SCAN_CONTINUE:
  				continue;
  			case OOM_SCAN_ABORT:
72ec70299   Tejun Heo   cgroup: make task...
1674
  				css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1675
1676
1677
1678
1679
1680
1681
1682
  				mem_cgroup_iter_break(memcg, iter);
  				if (chosen)
  					put_task_struct(chosen);
  				return;
  			case OOM_SCAN_OK:
  				break;
  			};
  			points = oom_badness(task, memcg, NULL, totalpages);
d49ad9355   David Rientjes   mm, oom: prefer t...
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
  			if (!points || points < chosen_points)
  				continue;
  			/* Prefer thread group leaders for display purposes */
  			if (points == chosen_points &&
  			    thread_group_leader(chosen))
  				continue;
  
  			if (chosen)
  				put_task_struct(chosen);
  			chosen = task;
  			chosen_points = points;
  			get_task_struct(chosen);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1695
  		}
72ec70299   Tejun Heo   cgroup: make task...
1696
  		css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1697
1698
1699
1700
1701
  	}
  
  	if (!chosen)
  		return;
  	points = chosen_points * 1000 / totalpages;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1702
1703
  	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
  			 NULL, "Memory cgroup out of memory");
9cbb78bb3   David Rientjes   mm, memcg: introd...
1704
  }
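/*
 * Illustrative scaling of the final badness value above: with a 1GB limit
 * (totalpages == 262144 assuming 4K pages), a chosen task whose
 * oom_badness() score is 131072 ends up with
 * points = 131072 * 1000 / 262144 == 500, so the value handed to
 * oom_kill_process() is normalised to roughly "per mille of the limit".
 */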
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1705
1706
  /**
   * test_mem_cgroup_node_reclaimable
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1707
   * @memcg: the target memcg
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1708
1709
1710
1711
1712
1713
1714
   * @nid: the node ID to be checked.
 * @noswap: specify true here if the user wants file only information.
   *
   * This function returns whether the specified memcg contains any
   * reclaimable pages on a node. Returns true if there are any reclaimable
   * pages in the node.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1715
  static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1716
1717
  		int nid, bool noswap)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1718
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1719
1720
1721
  		return true;
  	if (noswap || !total_swap_pages)
  		return false;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1722
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1723
1724
1725
1726
  		return true;
  	return false;
  
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
1727
  #if MAX_NUMNODES > 1
889976dbc   Ying Han   memcg: reclaim me...
1728
1729
1730
1731
1732
1733
1734
  
  /*
   * Always updating the nodemask is not very good - even if we have an empty
   * list or the wrong list here, we can start from some node and traverse all
   * nodes based on the zonelist. So update the list loosely once per 10 secs.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1735
  static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1736
1737
  {
  	int nid;
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1738
1739
1740
1741
  	/*
  	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
  	 * pagein/pageout changes since the last update.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1742
  	if (!atomic_read(&memcg->numainfo_events))
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1743
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1744
  	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
889976dbc   Ying Han   memcg: reclaim me...
1745
  		return;
889976dbc   Ying Han   memcg: reclaim me...
1746
  	/* make a nodemask where this memcg uses memory from */
31aaea4aa   Lai Jiangshan   memcontrol: use N...
1747
  	memcg->scan_nodes = node_states[N_MEMORY];
889976dbc   Ying Han   memcg: reclaim me...
1748

31aaea4aa   Lai Jiangshan   memcontrol: use N...
1749
  	for_each_node_mask(nid, node_states[N_MEMORY]) {
889976dbc   Ying Han   memcg: reclaim me...
1750

c0ff4b854   Raghavendra K T   memcg: rename mem...
1751
1752
  		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
  			node_clear(nid, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1753
  	}
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1754

c0ff4b854   Raghavendra K T   memcg: rename mem...
1755
1756
  	atomic_set(&memcg->numainfo_events, 0);
  	atomic_set(&memcg->numainfo_updating, 0);
889976dbc   Ying Han   memcg: reclaim me...
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
  }
  
  /*
   * Selecting a node where we start reclaim from. Because what we need is just
 * reducing the usage counter, starting from anywhere is OK. Considering
 * memory reclaim from the current node, there are pros and cons.
   *
   * Freeing memory from current node means freeing memory from a node which
 * we'll use or we've used, so it may make the LRU bad. And if several threads
 * hit limits, they will see contention on a node. But freeing from a remote
 * node means more cost for memory reclaim because of memory latency.
   *
   * Now, we use round-robin. Better algorithm is welcomed.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1771
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1772
1773
  {
  	int node;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1774
1775
  	mem_cgroup_may_update_nodemask(memcg);
  	node = memcg->last_scanned_node;
889976dbc   Ying Han   memcg: reclaim me...
1776

c0ff4b854   Raghavendra K T   memcg: rename mem...
1777
  	node = next_node(node, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1778
  	if (node == MAX_NUMNODES)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1779
  		node = first_node(memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1780
1781
1782
1783
1784
1785
1786
1787
  	/*
  	 * We call this when we hit limit, not when pages are added to LRU.
  	 * No LRU may hold pages because all pages are UNEVICTABLE or
	 * the memcg is too small and no pages are on the LRU. In that case,
	 * we use the current node.
  	 */
  	if (unlikely(node == MAX_NUMNODES))
  		node = numa_node_id();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1788
  	memcg->last_scanned_node = node;
889976dbc   Ying Han   memcg: reclaim me...
1789
1790
  	return node;
  }
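/*
 * Round-robin example for the selection above (illustrative): with
 * scan_nodes == {0, 2} and last_scanned_node == 0, next_node() returns 2;
 * on the following call next_node(2, ...) runs off the end of the mask,
 * so first_node() wraps the walk back to node 0.  If the mask is empty
 * (nothing reclaimable), both lookups return MAX_NUMNODES and the current
 * node is used as a fallback.
 */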
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
  /*
 * Check all nodes to see whether any of them contains reclaimable pages.
 * For a quick scan, we make use of scan_nodes. This allows us to skip
 * unused nodes. But scan_nodes is lazily updated and may not contain
 * enough new information, so we need to double check.
   */
  static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
  {
  	int nid;
  
  	/*
  	 * quick check...making use of scan_node.
  	 * We can skip unused nodes.
  	 */
  	if (!nodes_empty(memcg->scan_nodes)) {
  		for (nid = first_node(memcg->scan_nodes);
  		     nid < MAX_NUMNODES;
  		     nid = next_node(nid, memcg->scan_nodes)) {
  
  			if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
  				return true;
  		}
  	}
  	/*
  	 * Check rest of nodes.
  	 */
  	for_each_node_state(nid, N_MEMORY) {
  		if (node_isset(nid, memcg->scan_nodes))
  			continue;
  		if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
  			return true;
  	}
  	return false;
  }
889976dbc   Ying Han   memcg: reclaim me...
1825
  #else
c0ff4b854   Raghavendra K T   memcg: rename mem...
1826
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1827
1828
1829
  {
  	return 0;
  }
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1830

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
1831
1832
1833
1834
  static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
  {
  	return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
  }
889976dbc   Ying Han   memcg: reclaim me...
1835
  #endif
0608f43da   Andrew Morton   revert "memcg, vm...
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
  static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
  				   struct zone *zone,
  				   gfp_t gfp_mask,
  				   unsigned long *total_scanned)
  {
  	struct mem_cgroup *victim = NULL;
  	int total = 0;
  	int loop = 0;
  	unsigned long excess;
  	unsigned long nr_scanned;
  	struct mem_cgroup_reclaim_cookie reclaim = {
  		.zone = zone,
  		.priority = 0,
  	};
  
  	excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
  
  	while (1) {
  		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
  		if (!victim) {
  			loop++;
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
				 * anything, it might be because there are
  				 * no reclaimable pages under this hierarchy
  				 */
  				if (!total)
  					break;
  				/*
  				 * We want to do more targeted reclaim.
				 * excess >> 2 is not too excessive, so we don't
				 * reclaim too much, nor too little, so we don't
				 * keep coming back to reclaim from this cgroup
  				 */
  				if (total >= (excess >> 2) ||
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
  					break;
  			}
  			continue;
  		}
  		if (!mem_cgroup_reclaimable(victim, false))
  			continue;
  		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
  						     zone, &nr_scanned);
  		*total_scanned += nr_scanned;
  		if (!res_counter_soft_limit_excess(&root_memcg->res))
  			break;
6d61ef409   Balbir Singh   memcg: memory cgr...
1884
  	}
0608f43da   Andrew Morton   revert "memcg, vm...
1885
1886
  	mem_cgroup_iter_break(root_memcg, victim);
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1887
  }
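/*
 * Worked example for the "excess >> 2" cut-off above (illustrative): if the
 * hierarchy is 400 pages over its soft limit, the walk gives up after two
 * full rounds that reclaimed nothing, and otherwise stops once at least
 * 100 pages (excess >> 2) have been reclaimed, once the excess has been
 * pushed back to zero, or after MEM_CGROUP_MAX_RECLAIM_LOOPS rounds.
 */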
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1888
1889
1890
1891
1892
  #ifdef CONFIG_LOCKDEP
  static struct lockdep_map memcg_oom_lock_dep_map = {
  	.name = "memcg_oom_lock",
  };
  #endif
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1893
  static DEFINE_SPINLOCK(memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1894
1895
1896
1897
  /*
   * Check OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
   */
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1898
  static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1899
  {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1900
  	struct mem_cgroup *iter, *failed = NULL;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1901

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1902
  	spin_lock(&memcg_oom_lock);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1903
  	for_each_mem_cgroup_tree(iter, memcg) {
23751be00   Johannes Weiner   memcg: fix hierar...
1904
  		if (iter->oom_lock) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1905
1906
1907
1908
  			/*
  			 * this subtree of our hierarchy is already locked
  			 * so we cannot give a lock.
  			 */
79dfdaccd   Michal Hocko   memcg: make oom_l...
1909
  			failed = iter;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1910
1911
  			mem_cgroup_iter_break(memcg, iter);
  			break;
23751be00   Johannes Weiner   memcg: fix hierar...
1912
1913
  		} else
  			iter->oom_lock = true;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1914
  	}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1915

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
  	if (failed) {
  		/*
  		 * OK, we failed to lock the whole subtree so we have
  		 * to clean up what we set up to the failing subtree
  		 */
  		for_each_mem_cgroup_tree(iter, memcg) {
  			if (iter == failed) {
  				mem_cgroup_iter_break(memcg, iter);
  				break;
  			}
  			iter->oom_lock = false;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1927
  		}
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1928
1929
  	} else
  		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1930
1931
1932
1933
  
  	spin_unlock(&memcg_oom_lock);
  
  	return !failed;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1934
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1935

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1936
  static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1937
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1938
  	struct mem_cgroup *iter;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1939
  	spin_lock(&memcg_oom_lock);
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1940
  	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1941
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1942
  		iter->oom_lock = false;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1943
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1944
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1945
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1946
1947
  {
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1948
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1949
1950
  		atomic_inc(&iter->under_oom);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1951
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1952
1953
  {
  	struct mem_cgroup *iter;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1954
1955
1956
1957
1958
  	/*
  	 * When a new child is created while the hierarchy is under oom,
  	 * mem_cgroup_oom_lock() may not be called. We have to use
  	 * atomic_add_unless() here.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1959
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1960
  		atomic_add_unless(&iter->under_oom, -1, 0);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1961
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1962
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1963
  struct oom_wait_info {
d79154bb5   Hugh Dickins   memcg: replace me...
1964
  	struct mem_cgroup *memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1965
1966
1967
1968
1969
1970
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
1971
1972
  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
  	struct mem_cgroup *oom_wait_memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1973
1974
1975
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
d79154bb5   Hugh Dickins   memcg: replace me...
1976
  	oom_wait_memcg = oom_wait_info->memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1977

dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1978
  	/*
d79154bb5   Hugh Dickins   memcg: replace me...
1979
  	 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1980
1981
  	 * Then we can use css_is_ancestor without taking care of RCU.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1982
1983
  	if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
  		&& !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1984
  		return 0;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1985
1986
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1987
  static void memcg_wakeup_oom(struct mem_cgroup *memcg)
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1988
  {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1989
  	atomic_inc(&memcg->oom_wakeups);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1990
1991
  	/* for filtering, pass "memcg" as argument. */
  	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1992
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1993
  static void memcg_oom_recover(struct mem_cgroup *memcg)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1994
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1995
1996
  	if (memcg && atomic_read(&memcg->under_oom))
  		memcg_wakeup_oom(memcg);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1997
  }
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1998
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1999
  {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2000
2001
  	if (!current->memcg_oom.may_oom)
  		return;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2002
  	/*
494264208   Johannes Weiner   mm: memcg: handle...
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
  	 * We are in the middle of the charge context here, so we
  	 * don't want to block when potentially sitting on a callstack
  	 * that holds all kinds of filesystem and mm locks.
  	 *
  	 * Also, the caller may handle a failed allocation gracefully
  	 * (like optional page cache readahead) and so an OOM killer
  	 * invocation might not even be necessary.
  	 *
  	 * That's why we don't do anything here except remember the
  	 * OOM context and then deal with it at the end of the page
  	 * fault when the stack is unwound, the locks are released,
  	 * and when we know whether the fault was overall successful.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2015
  	 */
494264208   Johannes Weiner   mm: memcg: handle...
2016
2017
2018
2019
  	css_get(&memcg->css);
  	current->memcg_oom.memcg = memcg;
  	current->memcg_oom.gfp_mask = mask;
  	current->memcg_oom.order = order;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2020
2021
2022
2023
  }
  
  /**
   * mem_cgroup_oom_synchronize - complete memcg OOM handling
494264208   Johannes Weiner   mm: memcg: handle...
2024
   * @handle: actually kill/wait or just clean up the OOM state
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2025
   *
494264208   Johannes Weiner   mm: memcg: handle...
2026
2027
   * This has to be called at the end of a page fault if the memcg OOM
   * handler was enabled.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2028
   *
494264208   Johannes Weiner   mm: memcg: handle...
2029
   * Memcg supports userspace OOM handling where failed allocations must
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2030
2031
2032
2033
   * sleep on a waitqueue until the userspace task resolves the
   * situation.  Sleeping directly in the charge context with all kinds
   * of locks held is not a good idea; instead we remember an OOM state
   * in the task and mem_cgroup_oom_synchronize() has to be called at
494264208   Johannes Weiner   mm: memcg: handle...
2034
   * the end of the page fault to complete the OOM handling.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2035
2036
   *
   * Returns %true if an ongoing memcg OOM situation was detected and
494264208   Johannes Weiner   mm: memcg: handle...
2037
   * completed, %false otherwise.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2038
   */
494264208   Johannes Weiner   mm: memcg: handle...
2039
  bool mem_cgroup_oom_synchronize(bool handle)
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2040
  {
494264208   Johannes Weiner   mm: memcg: handle...
2041
  	struct mem_cgroup *memcg = current->memcg_oom.memcg;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2042
  	struct oom_wait_info owait;
494264208   Johannes Weiner   mm: memcg: handle...
2043
  	bool locked;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2044
2045
  
  	/* OOM is global, do not handle */
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2046
  	if (!memcg)
494264208   Johannes Weiner   mm: memcg: handle...
2047
  		return false;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2048

494264208   Johannes Weiner   mm: memcg: handle...
2049
2050
  	if (!handle)
  		goto cleanup;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2051
2052
2053
2054
2055
2056
  
  	owait.memcg = memcg;
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2057

3812c8c8f   Johannes Weiner   mm: memcg: do not...
2058
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
494264208   Johannes Weiner   mm: memcg: handle...
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
  	mem_cgroup_mark_under_oom(memcg);
  
  	locked = mem_cgroup_oom_trylock(memcg);
  
  	if (locked)
  		mem_cgroup_oom_notify(memcg);
  
  	if (locked && !memcg->oom_kill_disable) {
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
  					 current->memcg_oom.order);
  	} else {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2072
  		schedule();
494264208   Johannes Weiner   mm: memcg: handle...
2073
2074
2075
2076
2077
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  	}
  
  	if (locked) {
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
2078
2079
2080
2081
2082
2083
2084
2085
  		mem_cgroup_oom_unlock(memcg);
  		/*
  		 * There is no guarantee that an OOM-lock contender
  		 * sees the wakeups triggered by the OOM kill
  		 * uncharges.  Wake any sleepers explicitly.
  		 */
  		memcg_oom_recover(memcg);
  	}
494264208   Johannes Weiner   mm: memcg: handle...
2086
2087
  cleanup:
  	current->memcg_oom.memcg = NULL;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
2088
  	css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2089
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
2090
  }
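
  /*
   * Illustrative sketch, not code from this file: the fault-exit helper
   * named below (pagefault_out_of_memory) is an assumption about the
   * caller.  Once the page fault has unwound its stack and dropped its
   * locks, the OOM state recorded by mem_cgroup_oom() above is expected
   * to be completed roughly like this:
   *
   *   void pagefault_out_of_memory(void)
   *   {
   *       if (mem_cgroup_oom_synchronize(true))
   *           return;    [a memcg OOM was handled or waited out]
   *       [otherwise fall back to the global OOM killer]
   *   }
   */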
d7365e783   Johannes Weiner   mm: memcontrol: f...
2091
2092
2093
2094
2095
  /**
   * mem_cgroup_begin_page_stat - begin a page state statistics transaction
   * @page: page that is going to change accounted state
   * @locked: &memcg->move_lock slowpath was taken
   * @flags: IRQ-state flags for &memcg->move_lock
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
2096
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
2097
2098
2099
   * This function must mark the beginning of an accounted page state
   * change to prevent double accounting when the page is concurrently
   * being moved to another memcg:
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
2100
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
2101
2102
2103
2104
   *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
   *   if (TestClearPageState(page))
   *     mem_cgroup_update_page_stat(memcg, state, -1);
   *   mem_cgroup_end_page_stat(memcg, locked, flags);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
2105
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
2106
2107
2108
   * The RCU lock is held throughout the transaction.  The fast path can
   * get away without acquiring the memcg->move_lock (@locked is false)
   * because page moving starts with an RCU grace period.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
2109
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
2110
2111
2112
2113
2114
   * The RCU lock also protects the memcg from being freed when the page
   * state that is going to change is the only thing preventing the page
   * from being uncharged.  E.g. end-writeback clearing PageWriteback(),
   * which allows migration to go ahead and uncharge the page before the
   * accounting transaction is complete.
d69b042f3   Balbir Singh   memcg: add file-b...
2115
   */
d7365e783   Johannes Weiner   mm: memcontrol: f...
2116
2117
2118
  struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
  					      bool *locked,
  					      unsigned long *flags)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2119
2120
2121
  {
  	struct mem_cgroup *memcg;
  	struct page_cgroup *pc;
d7365e783   Johannes Weiner   mm: memcontrol: f...
2122
2123
2124
2125
  	rcu_read_lock();
  
  	if (mem_cgroup_disabled())
  		return NULL;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2126
2127
2128
2129
  	pc = lookup_page_cgroup(page);
  again:
  	memcg = pc->mem_cgroup;
  	if (unlikely(!memcg || !PageCgroupUsed(pc)))
d7365e783   Johannes Weiner   mm: memcontrol: f...
2130
2131
2132
  		return NULL;
  
  	*locked = false;
bdcbb659f   Qiang Huang   memcg: fold mem_c...
2133
  	if (atomic_read(&memcg->moving_account) <= 0)
d7365e783   Johannes Weiner   mm: memcontrol: f...
2134
  		return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2135
2136
2137
2138
2139
2140
2141
  
  	move_lock_mem_cgroup(memcg, flags);
  	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
  		move_unlock_mem_cgroup(memcg, flags);
  		goto again;
  	}
  	*locked = true;
d7365e783   Johannes Weiner   mm: memcontrol: f...
2142
2143
  
  	return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2144
  }
d7365e783   Johannes Weiner   mm: memcontrol: f...
2145
2146
2147
2148
2149
2150
2151
2152
  /**
   * mem_cgroup_end_page_stat - finish a page state statistics transaction
   * @memcg: the memcg that was accounted against
   * @locked: value received from mem_cgroup_begin_page_stat()
   * @flags: value received from mem_cgroup_begin_page_stat()
   */
  void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
  			      unsigned long flags)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2153
  {
d7365e783   Johannes Weiner   mm: memcontrol: f...
2154
2155
  	if (memcg && locked)
  		move_unlock_mem_cgroup(memcg, &flags);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2156

d7365e783   Johannes Weiner   mm: memcontrol: f...
2157
  	rcu_read_unlock();
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
2158
  }
d7365e783   Johannes Weiner   mm: memcontrol: f...
2159
2160
2161
2162
2163
2164
2165
2166
2167
  /**
   * mem_cgroup_update_page_stat - update page state statistics
   * @memcg: memcg to account against
   * @idx: page state item to account
   * @val: number of pages (positive or negative)
   *
   * See mem_cgroup_begin_page_stat() for locking requirements.
   */
  void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
68b4876d9   Sha Zhengju   memcg: remove MEM...
2168
  				 enum mem_cgroup_stat_index idx, int val)
d69b042f3   Balbir Singh   memcg: add file-b...
2169
  {
658b72c5a   Sha Zhengju   memcg: check for ...
2170
  	VM_BUG_ON(!rcu_read_lock_held());
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
2171

d7365e783   Johannes Weiner   mm: memcontrol: f...
2172
2173
  	if (memcg)
  		this_cpu_add(memcg->stat->count[idx], val);
d69b042f3   Balbir Singh   memcg: add file-b...
2174
  }
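
  /*
   * A concrete, hedged example of the transaction documented above.  The
   * MEM_CGROUP_STAT_FILE_MAPPED index and the mapcount test are shown only
   * to illustrate how an rmap-style caller would account a state change:
   *
   *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
   *   if (atomic_inc_and_test(&page->_mapcount))
   *       mem_cgroup_update_page_stat(memcg,
   *                                   MEM_CGROUP_STAT_FILE_MAPPED, 1);
   *   mem_cgroup_end_page_stat(memcg, locked, flags);
   */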
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
2175

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2176
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2177
2178
2179
   * size of the first charge trial.  "32" comes from vmscan.c's magic value.
   * TODO: this may need to be larger on big-iron machines.
   */
7ec99d621   Johannes Weiner   memcg: unify char...
2180
  #define CHARGE_BATCH	32U
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2181
2182
  struct memcg_stock_pcp {
  	struct mem_cgroup *cached; /* this is never the root cgroup */
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2183
  	unsigned int nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2184
  	struct work_struct work;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2185
  	unsigned long flags;
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
2186
  #define FLUSHING_CACHED_CHARGE	0
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2187
2188
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2189
  static DEFINE_MUTEX(percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2190

a0956d544   Suleiman Souhlal   memcg: make it po...
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
  /**
   * consume_stock: Try to consume stocked charge on this cpu.
   * @memcg: memcg to consume from.
   * @nr_pages: how many pages to charge.
   *
   * The charges will only happen if @memcg matches the current cpu's memcg
   * stock, and at least @nr_pages are available in that stock.  If the stock
   * cannot service the allocation, the caller falls back to res_counter
   * charging and refills the stock afterwards.
   *
   * returns true if successful, false otherwise.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2201
   */
a0956d544   Suleiman Souhlal   memcg: make it po...
2202
  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2203
2204
2205
  {
  	struct memcg_stock_pcp *stock;
  	bool ret = true;
a0956d544   Suleiman Souhlal   memcg: make it po...
2206
2207
  	if (nr_pages > CHARGE_BATCH)
  		return false;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2208
  	stock = &get_cpu_var(memcg_stock);
a0956d544   Suleiman Souhlal   memcg: make it po...
2209
2210
  	if (memcg == stock->cached && stock->nr_pages >= nr_pages)
  		stock->nr_pages -= nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
  	else /* need to call res_counter_charge */
  		ret = false;
  	put_cpu_var(memcg_stock);
  	return ret;
  }
  
  /*
   * Return the pages cached in this percpu stock to res_counter and reset the
   * cached information.
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2223
2224
2225
2226
  	if (stock->nr_pages) {
  		unsigned long bytes = stock->nr_pages * PAGE_SIZE;
  
  		res_counter_uncharge(&old->res, bytes);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2227
  		if (do_swap_account)
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2228
2229
  			res_counter_uncharge(&old->memsw, bytes);
  		stock->nr_pages = 0;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2230
2231
  	}
  	stock->cached = NULL;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2232
2233
2234
2235
2236
2237
2238
2239
  }
  
  /*
   * This must be called with preemption disabled, or by a thread pinned
   * to the local cpu.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
7c8e0181e   Christoph Lameter   mm: replace __get...
2240
  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2241
  	drain_stock(stock);
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2242
  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2243
  }
e47774962   Michal Hocko   memcg: move memcg...
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
  static void __init memcg_stock_init(void)
  {
  	int cpu;
  
  	for_each_possible_cpu(cpu) {
  		struct memcg_stock_pcp *stock =
  					&per_cpu(memcg_stock, cpu);
  		INIT_WORK(&stock->work, drain_local_stock);
  	}
  }
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2254
2255
  /*
   * Cache charges (nr_pages) taken from res_counter into the local per-cpu area.
320cc51d9   Greg Thelen   mm: fix typo in r...
2256
   * They will be consumed by consume_stock() later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2257
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2258
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2259
2260
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2261
  	if (stock->cached != memcg) { /* reset if necessary */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2262
  		drain_stock(stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2263
  		stock->cached = memcg;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2264
  	}
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2265
  	stock->nr_pages += nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2266
2267
2268
2269
  	put_cpu_var(memcg_stock);
  }
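
  /*
   * Worked example (illustrative numbers only): with CHARGE_BATCH == 32, a
   * stream of order-0 charges from one task on one cpu touches the
   * res_counter once per batch instead of once per page:
   *
   *   try_charge(memcg, gfp, 1)
   *       consume_stock(memcg, 1)        -> false, stock is empty
   *       res_counter_charge(&memcg->res, 32 * PAGE_SIZE, ...)
   *       refill_stock(memcg, 31)        -> stock->nr_pages = 31
   *
   *   the next 31 single-page charges:
   *       consume_stock(memcg, 1)        -> true, no res_counter access
   */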
  
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
2270
   * Drains all per-CPU charge caches for the given root_memcg and the subtree
d38144b7a   Michal Hocko   memcg: unify sync...
2271
2272
   * of the hierarchy under it.  The sync flag says whether we should block
   * until the work is done.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2273
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2274
  static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2275
  {
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2276
  	int cpu, curcpu;
d38144b7a   Michal Hocko   memcg: unify sync...
2277

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2278
  	/* Notify other cpus that system-wide "drain" is running */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2279
  	get_online_cpus();
5af12d0ef   Johannes Weiner   memcg: pin execut...
2280
  	curcpu = get_cpu();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2281
2282
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2283
  		struct mem_cgroup *memcg;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2284

c0ff4b854   Raghavendra K T   memcg: rename mem...
2285
2286
  		memcg = stock->cached;
  		if (!memcg || !stock->nr_pages)
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2287
  			continue;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2288
  		if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
3e92041d6   Michal Hocko   memcg: add mem_cg...
2289
  			continue;
d1a05b697   Michal Hocko   memcg: do not try...
2290
2291
2292
2293
2294
2295
  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
  			if (cpu == curcpu)
  				drain_local_stock(&stock->work);
  			else
  				schedule_work_on(cpu, &stock->work);
  		}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2296
  	}
5af12d0ef   Johannes Weiner   memcg: pin execut...
2297
  	put_cpu();
d38144b7a   Michal Hocko   memcg: unify sync...
2298
2299
2300
2301
2302
2303
  
  	if (!sync)
  		goto out;
  
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2304
  		if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
d38144b7a   Michal Hocko   memcg: unify sync...
2305
2306
2307
  			flush_work(&stock->work);
  	}
  out:
f894ffa86   Andrew Morton   memcg: trivial cl...
2308
  	put_online_cpus();
d38144b7a   Michal Hocko   memcg: unify sync...
2309
2310
2311
2312
2313
2314
2315
2316
  }
  
  /*
   * Tries to drain stocked charges on other cpus. This function is asynchronous
   * and just queues a work item per cpu to drain locally on each cpu. The caller
   * can expect some charges to be returned to res_counter later, but cannot wait
   * for that to happen.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2317
  static void drain_all_stock_async(struct mem_cgroup *root_memcg)
d38144b7a   Michal Hocko   memcg: unify sync...
2318
  {
9f50fad65   Michal Hocko   Revert "memcg: ge...
2319
2320
2321
2322
2323
  	/*
  	 * If a drain is already in progress, avoid adding more kworker runs.
  	 */
  	if (!mutex_trylock(&percpu_charge_mutex))
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2324
  	drain_all_stock(root_memcg, false);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2325
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2326
2327
2328
  }
  
  /* This is a synchronous drain interface. */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2329
  static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2330
2331
  {
  	/* called when force_empty is called */
9f50fad65   Michal Hocko   Revert "memcg: ge...
2332
  	mutex_lock(&percpu_charge_mutex);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2333
  	drain_all_stock(root_memcg, true);
9f50fad65   Michal Hocko   Revert "memcg: ge...
2334
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2335
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2336
2337
2338
2339
  /*
   * This function drains the percpu counter values from a DEAD cpu and
   * moves them to the local cpu. Note that this function can be preempted.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2340
  static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2341
2342
  {
  	int i;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2343
  	spin_lock(&memcg->pcp_counter_lock);
6104621de   Johannes Weiner   mm: memcg: remove...
2344
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2345
  		long x = per_cpu(memcg->stat->count[i], cpu);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2346

c0ff4b854   Raghavendra K T   memcg: rename mem...
2347
2348
  		per_cpu(memcg->stat->count[i], cpu) = 0;
  		memcg->nocpu_base.count[i] += x;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2349
  	}
e9f8974f2   Johannes Weiner   memcg: break out ...
2350
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2351
  		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
2352

c0ff4b854   Raghavendra K T   memcg: rename mem...
2353
2354
  		per_cpu(memcg->stat->events[i], cpu) = 0;
  		memcg->nocpu_base.events[i] += x;
e9f8974f2   Johannes Weiner   memcg: break out ...
2355
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2356
  	spin_unlock(&memcg->pcp_counter_lock);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2357
  }
0db0628d9   Paul Gortmaker   kernel: delete __...
2358
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2359
2360
2361
2362
2363
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2364
  	struct mem_cgroup *iter;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2365

619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
2366
  	if (action == CPU_ONLINE)
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2367
  		return NOTIFY_OK;
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2368

d833049bd   Kirill A. Shutemov   memcg: fix broken...
2369
  	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2370
  		return NOTIFY_OK;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2371

9f3a0d093   Johannes Weiner   mm: memcg: consol...
2372
  	for_each_mem_cgroup(iter)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2373
  		mem_cgroup_drain_pcp_counter(iter, cpu);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2374
2375
2376
2377
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2378
2379
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
  		      unsigned int nr_pages)
8a9f3ccd2   Balbir Singh   Memory controller...
2380
  {
7ec99d621   Johannes Weiner   memcg: unify char...
2381
  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
9b1306192   Johannes Weiner   mm: memcontrol: r...
2382
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2383
2384
2385
  	struct mem_cgroup *mem_over_limit;
  	struct res_counter *fail_res;
  	unsigned long nr_reclaimed;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2386
  	unsigned long long size;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2387
2388
  	bool may_swap = true;
  	bool drained = false;
05b843012   Johannes Weiner   mm: memcontrol: u...
2389
  	int ret = 0;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
2390

ce00a9673   Johannes Weiner   mm: memcontrol: r...
2391
2392
  	if (mem_cgroup_is_root(memcg))
  		goto done;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2393
  retry:
b6b6cc72b   Michal Hocko   memcg: do not rep...
2394
2395
  	if (consume_stock(memcg, nr_pages))
  		goto done;
8a9f3ccd2   Balbir Singh   Memory controller...
2396

6539cc053   Johannes Weiner   mm: memcontrol: f...
2397
  	size = batch * PAGE_SIZE;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2398
2399
2400
  	if (!do_swap_account ||
  	    !res_counter_charge(&memcg->memsw, size, &fail_res)) {
  		if (!res_counter_charge(&memcg->res, size, &fail_res))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2401
  			goto done_restock;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2402
2403
2404
2405
  		if (do_swap_account)
  			res_counter_uncharge(&memcg->memsw, size);
  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
  	} else {
6539cc053   Johannes Weiner   mm: memcontrol: f...
2406
  		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2407
  		may_swap = false;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2408
  	}
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2409

6539cc053   Johannes Weiner   mm: memcontrol: f...
2410
2411
2412
2413
  	if (batch > nr_pages) {
  		batch = nr_pages;
  		goto retry;
  	}
6d61ef409   Balbir Singh   memcg: memory cgr...
2414

06b078fc0   Johannes Weiner   mm: memcontrol: r...
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
  	/*
  	 * Unlike in global OOM situations, memcg is not in a physical
  	 * memory shortage.  Allow dying and OOM-killed tasks to
  	 * bypass the last charges so that they can exit quickly and
  	 * free their memory.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
  		     fatal_signal_pending(current) ||
  		     current->flags & PF_EXITING))
  		goto bypass;
  
  	if (unlikely(task_in_memcg_oom(current)))
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2428
2429
  	if (!(gfp_mask & __GFP_WAIT))
  		goto nomem;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2430

b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2431
2432
  	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
  						    gfp_mask, may_swap);
6539cc053   Johannes Weiner   mm: memcontrol: f...
2433

61e02c745   Johannes Weiner   mm: memcontrol: c...
2434
  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
6539cc053   Johannes Weiner   mm: memcontrol: f...
2435
  		goto retry;
28c34c291   Johannes Weiner   mm: memcontrol: r...
2436

b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2437
2438
2439
2440
2441
  	if (!drained) {
  		drain_all_stock_async(mem_over_limit);
  		drained = true;
  		goto retry;
  	}
28c34c291   Johannes Weiner   mm: memcontrol: r...
2442
2443
  	if (gfp_mask & __GFP_NORETRY)
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2444
2445
2446
2447
2448
2449
2450
2451
2452
  	/*
  	 * Even though the limit is exceeded at this point, reclaim
  	 * may have been able to free some pages.  Retry the charge
  	 * before killing the task.
  	 *
  	 * Only for regular pages, though: huge pages are rather
  	 * unlikely to succeed so close to the limit, and we fall back
  	 * to regular pages anyway in case of failure.
  	 */
61e02c745   Johannes Weiner   mm: memcontrol: c...
2453
  	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2454
2455
2456
2457
2458
2459
2460
  		goto retry;
  	/*
  	 * During task move, charges can be double counted, so it's better to
  	 * wait until the end of task_move if one is in progress.
  	 */
  	if (mem_cgroup_wait_acct_move(mem_over_limit))
  		goto retry;
9b1306192   Johannes Weiner   mm: memcontrol: r...
2461
2462
  	if (nr_retries--)
  		goto retry;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
2463
2464
  	if (gfp_mask & __GFP_NOFAIL)
  		goto bypass;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2465
2466
  	if (fatal_signal_pending(current))
  		goto bypass;
61e02c745   Johannes Weiner   mm: memcontrol: c...
2467
  	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2468
  nomem:
6d1fdc489   Johannes Weiner   memcg: sanitize _...
2469
  	if (!(gfp_mask & __GFP_NOFAIL))
3168ecbe1   Johannes Weiner   mm: memcg: use pr...
2470
  		return -ENOMEM;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2471
  bypass:
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2472
  	return -EINTR;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2473
2474
2475
2476
2477
  
  done_restock:
  	if (batch > nr_pages)
  		refill_stock(memcg, batch - nr_pages);
  done:
05b843012   Johannes Weiner   mm: memcontrol: u...
2478
  	return ret;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2479
  }
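
  /*
   * Sketch of the assumed caller pattern.  The public entry points that
   * pair try_charge() with commit_charge() live further down in this file;
   * the shape shown here is a summary, not a verbatim copy:
   *
   *   ret = try_charge(memcg, gfp_mask, nr_pages);
   *   if (ret == -EINTR) {
   *       [the charge was bypassed to the root cgroup]
   *       memcg = root_mem_cgroup;
   *       ret = 0;
   *   }
   *   ...
   *   commit_charge(page, memcg, lrucare);
   */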
8a9f3ccd2   Balbir Singh   Memory controller...
2480

00501b531   Johannes Weiner   mm: memcontrol: r...
2481
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2482
  {
05b843012   Johannes Weiner   mm: memcontrol: u...
2483
  	unsigned long bytes = nr_pages * PAGE_SIZE;
e7018b8d2   Johannes Weiner   memcg: keep only ...
2484

ce00a9673   Johannes Weiner   mm: memcontrol: r...
2485
2486
  	if (mem_cgroup_is_root(memcg))
  		return;
05b843012   Johannes Weiner   mm: memcontrol: u...
2487
2488
2489
  	res_counter_uncharge(&memcg->res, bytes);
  	if (do_swap_account)
  		res_counter_uncharge(&memcg->memsw, bytes);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
2490
  }
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2491
  /*
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2492
2493
2494
2495
2496
2497
2498
   * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
   * This is useful when moving usage to the parent cgroup.
   */
  static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
  					unsigned int nr_pages)
  {
  	unsigned long bytes = nr_pages * PAGE_SIZE;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2499
2500
  	if (mem_cgroup_is_root(memcg))
  		return;
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2501
2502
2503
2504
2505
2506
2507
  	res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
  	if (do_swap_account)
  		res_counter_uncharge_until(&memcg->memsw,
  						memcg->memsw.parent, bytes);
  }
  
  /*
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2508
   * A helper function to get a mem_cgroup from an ID. Must be called under
ec903c0c8   Tejun Heo   cgroup: rename cs...
2509
2510
2511
   * rcu_read_lock().  The caller is responsible for calling
   * css_tryget_online() if the mem_cgroup is used for charging.  (Dropping
   * the refcnt from swap may happen against an already removed memcg.)
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2512
2513
2514
   */
  static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
  {
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2515
2516
2517
  	/* ID 0 is unused ID */
  	if (!id)
  		return NULL;
34c00c319   Li Zefan   memcg: convert to...
2518
  	return mem_cgroup_from_id(id);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2519
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
  /*
   * try_get_mem_cgroup_from_page - look up page's memcg association
   * @page: the page
   *
   * Look up, get a css reference, and return the memcg that owns @page.
   *
   * The page must be locked to prevent racing with swap-in and page
   * cache charges.  If coming from an unlocked page table, the caller
   * must ensure the page is on the LRU or this can race with charging.
   */
e42d9d5d4   Wu Fengguang   memcg: rename and...
2530
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2531
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2532
  	struct mem_cgroup *memcg = NULL;
3c776e646   Daisuke Nishimura   memcg: charge swa...
2533
  	struct page_cgroup *pc;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2534
  	unsigned short id;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2535
  	swp_entry_t ent;
309381fea   Sasha Levin   mm: dump page whe...
2536
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
3c776e646   Daisuke Nishimura   memcg: charge swa...
2537

3c776e646   Daisuke Nishimura   memcg: charge swa...
2538
  	pc = lookup_page_cgroup(page);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2539
  	if (PageCgroupUsed(pc)) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2540
  		memcg = pc->mem_cgroup;
ec903c0c8   Tejun Heo   cgroup: rename cs...
2541
  		if (memcg && !css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2542
  			memcg = NULL;
e42d9d5d4   Wu Fengguang   memcg: rename and...
2543
  	} else if (PageSwapCache(page)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
2544
  		ent.val = page_private(page);
9fb4b7cc0   Bob Liu   page_cgroup: add ...
2545
  		id = lookup_swap_cgroup_id(ent);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2546
  		rcu_read_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
2547
  		memcg = mem_cgroup_lookup(id);
ec903c0c8   Tejun Heo   cgroup: rename cs...
2548
  		if (memcg && !css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2549
  			memcg = NULL;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2550
  		rcu_read_unlock();
3c776e646   Daisuke Nishimura   memcg: charge swa...
2551
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2552
  	return memcg;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2553
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
  static void lock_page_lru(struct page *page, int *isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	spin_lock_irq(&zone->lru_lock);
  	if (PageLRU(page)) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		ClearPageLRU(page);
  		del_page_from_lru_list(page, lruvec, page_lru(page));
  		*isolated = 1;
  	} else
  		*isolated = 0;
  }
  
  static void unlock_page_lru(struct page *page, int isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	if (isolated) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		SetPageLRU(page);
  		add_page_to_lru_list(page, lruvec, page_lru(page));
  	}
  	spin_unlock_irq(&zone->lru_lock);
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2584
  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
6abb5a867   Johannes Weiner   mm: memcontrol: a...
2585
  			  bool lrucare)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2586
  {
ce587e65e   Johannes Weiner   mm: memcg: move p...
2587
  	struct page_cgroup *pc = lookup_page_cgroup(page);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2588
  	int isolated;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2589

309381fea   Sasha Levin   mm: dump page whe...
2590
  	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2591
2592
2593
2594
  	/*
  	 * we don't need the page_cgroup lock for tail pages, because they are
  	 * not accessed by any other context at this point.
  	 */
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2595
2596
2597
2598
2599
  
  	/*
  	 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page
  	 * may already be on some other mem_cgroup's LRU.  Take care of it.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2600
2601
  	if (lrucare)
  		lock_page_lru(page, &isolated);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2602

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
  	/*
  	 * Nobody should be changing or seriously looking at
  	 * pc->mem_cgroup and pc->flags at this point:
  	 *
  	 * - the page is uncharged
  	 *
  	 * - the page is off-LRU
  	 *
  	 * - an anonymous fault has exclusive page access, except for
  	 *   a locked page table
  	 *
  	 * - a page cache insertion, a swapin fault, or a migration
  	 *   have the page locked
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2617
  	pc->mem_cgroup = memcg;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2618
  	pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2619

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2620
2621
  	if (lrucare)
  		unlock_page_lru(page, isolated);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2622
  }
66e1707bc   Balbir Singh   Memory controller...
2623

7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2624
  static DEFINE_MUTEX(set_limit_mutex);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2625
  #ifdef CONFIG_MEMCG_KMEM
bd6731458   Vladimir Davydov   memcg, slab: simp...
2626
2627
2628
2629
2630
  /*
   * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
   * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
   */
  static DEFINE_MUTEX(memcg_slab_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
2631
  static DEFINE_MUTEX(activate_kmem_mutex);
1f458cbf1   Glauber Costa   memcg: destroy me...
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
  /*
   * This is a bit cumbersome, but it is rarely used and avoids a backpointer
   * in the memcg_cache_params struct.
   */
  static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
  {
  	struct kmem_cache *cachep;
  
  	VM_BUG_ON(p->is_root_cache);
  	cachep = p->root_cache;
7a67d7abc   Qiang Huang   memcg, kmem: use ...
2642
  	return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
1f458cbf1   Glauber Costa   memcg: destroy me...
2643
  }
749c54151   Glauber Costa   memcg: aggregate ...
2644
  #ifdef CONFIG_SLABINFO
2da8ca822   Tejun Heo   cgroup: replace c...
2645
  static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
749c54151   Glauber Costa   memcg: aggregate ...
2646
  {
2da8ca822   Tejun Heo   cgroup: replace c...
2647
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
749c54151   Glauber Costa   memcg: aggregate ...
2648
  	struct memcg_cache_params *params;
cf2b8fbf1   Vladimir Davydov   memcg: zap memcg_...
2649
  	if (!memcg_kmem_is_active(memcg))
749c54151   Glauber Costa   memcg: aggregate ...
2650
2651
2652
  		return -EIO;
  
  	print_slabinfo_header(m);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2653
  	mutex_lock(&memcg_slab_mutex);
749c54151   Glauber Costa   memcg: aggregate ...
2654
2655
  	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
  		cache_show(memcg_params_to_cache(params), m);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2656
  	mutex_unlock(&memcg_slab_mutex);
749c54151   Glauber Costa   memcg: aggregate ...
2657
2658
2659
2660
  
  	return 0;
  }
  #endif
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2661
  static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2662
2663
  {
  	struct res_counter *fail_res;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2664
  	int ret = 0;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2665
2666
2667
2668
  
  	ret = res_counter_charge(&memcg->kmem, size, &fail_res);
  	if (ret)
  		return ret;
00501b531   Johannes Weiner   mm: memcontrol: r...
2669
  	ret = try_charge(memcg, gfp, size >> PAGE_SHIFT);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2670
2671
  	if (ret == -EINTR)  {
  		/*
00501b531   Johannes Weiner   mm: memcontrol: r...
2672
2673
2674
2675
2676
2677
  		 * try_charge() chose to bypass to root due to OOM kill or
  		 * fatal signal.  Since our only options are to either fail
  		 * the allocation or charge it to this cgroup, do it as a
  		 * temporary condition. But we can't fail. From a kmem/slab
  		 * perspective, the cache has already been selected, by
  		 * mem_cgroup_kmem_get_cache(), so it is too late to change
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2678
2679
2680
  		 * our minds.
  		 *
  		 * This condition will only trigger if the task entered
00501b531   Johannes Weiner   mm: memcontrol: r...
2681
2682
2683
  		 * memcg_charge_kmem in a sane state, but was OOM-killed
  		 * during try_charge() above. Tasks that were already dying
  		 * when the allocation was triggered should have already been
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
  		 * directed to the root cgroup in memcontrol.h
  		 */
  		res_counter_charge_nofail(&memcg->res, size, &fail_res);
  		if (do_swap_account)
  			res_counter_charge_nofail(&memcg->memsw, size,
  						  &fail_res);
  		ret = 0;
  	} else if (ret)
  		res_counter_uncharge(&memcg->kmem, size);
  
  	return ret;
  }
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2696
  static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2697
  {
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2698
2699
2700
  	res_counter_uncharge(&memcg->res, size);
  	if (do_swap_account)
  		res_counter_uncharge(&memcg->memsw, size);
7de37682b   Glauber Costa   memcg: kmem accou...
2701
2702
2703
2704
  
  	/* Not down to 0 */
  	if (res_counter_uncharge(&memcg->kmem, size))
  		return;
10d5ebf40   Li Zefan   memcg: use css_ge...
2705
2706
2707
2708
2709
2710
2711
2712
  	/*
  	 * Releases a reference taken in kmem_cgroup_css_offline in case
  	 * this last uncharge is racing with the offlining code or it is
  	 * outliving the memcg existence.
  	 *
  	 * The memory barrier imposed by test&clear is paired with the
  	 * explicit one in memcg_kmem_mark_dead().
  	 */
7de37682b   Glauber Costa   memcg: kmem accou...
2713
  	if (memcg_kmem_test_and_clear_dead(memcg))
10d5ebf40   Li Zefan   memcg: use css_ge...
2714
  		css_put(&memcg->css);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2715
  }
2633d7a02   Glauber Costa   slab/slub: consid...
2716
2717
2718
2719
2720
2721
2722
2723
2724
  /*
   * helper for accessing a memcg's index. It will be used as an index in the
   * child cache array in kmem_cache, and also to derive its name. This function
   * will return -1 when this is not a kmem-limited memcg.
   */
  int memcg_cache_id(struct mem_cgroup *memcg)
  {
  	return memcg ? memcg->kmemcg_id : -1;
  }
f3bb3043a   Vladimir Davydov   memcg: don't call...
2725
  static int memcg_alloc_cache_id(void)
55007d849   Glauber Costa   memcg: allocate m...
2726
  {
f3bb3043a   Vladimir Davydov   memcg: don't call...
2727
2728
2729
2730
2731
2732
2733
  	int id, size;
  	int err;
  
  	id = ida_simple_get(&kmem_limited_groups,
  			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
  	if (id < 0)
  		return id;
55007d849   Glauber Costa   memcg: allocate m...
2734

f3bb3043a   Vladimir Davydov   memcg: don't call...
2735
2736
2737
2738
2739
2740
2741
2742
2743
  	if (id < memcg_limited_groups_array_size)
  		return id;
  
  	/*
  	 * There's no space for the new id in memcg_caches arrays,
  	 * so we have to grow them.
  	 */
  
  	size = 2 * (id + 1);
55007d849   Glauber Costa   memcg: allocate m...
2744
2745
2746
2747
  	if (size < MEMCG_CACHES_MIN_SIZE)
  		size = MEMCG_CACHES_MIN_SIZE;
  	else if (size > MEMCG_CACHES_MAX_SIZE)
  		size = MEMCG_CACHES_MAX_SIZE;
f3bb3043a   Vladimir Davydov   memcg: don't call...
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
  	mutex_lock(&memcg_slab_mutex);
  	err = memcg_update_all_caches(size);
  	mutex_unlock(&memcg_slab_mutex);
  
  	if (err) {
  		ida_simple_remove(&kmem_limited_groups, id);
  		return err;
  	}
  	return id;
  }
  
  static void memcg_free_cache_id(int id)
  {
  	ida_simple_remove(&kmem_limited_groups, id);
55007d849   Glauber Costa   memcg: allocate m...
2762
2763
2764
2765
2766
2767
2768
2769
2770
  }
  
  /*
   * We should update the current array size iff all cache updates succeed. This
   * can only be done from the slab side. The slab mutex needs to be held when
   * calling this.
   */
  void memcg_update_array_size(int num)
  {
f3bb3043a   Vladimir Davydov   memcg: don't call...
2771
  	memcg_limited_groups_array_size = num;
55007d849   Glauber Costa   memcg: allocate m...
2772
  }
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2773
2774
  static void memcg_register_cache(struct mem_cgroup *memcg,
  				 struct kmem_cache *root_cache)
2633d7a02   Glauber Costa   slab/slub: consid...
2775
  {
93f39eea9   Vladimir Davydov   memcg: memcg_kmem...
2776
2777
  	static char memcg_name_buf[NAME_MAX + 1]; /* protected by
  						     memcg_slab_mutex */
bd6731458   Vladimir Davydov   memcg, slab: simp...
2778
  	struct kmem_cache *cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2779
  	int id;
bd6731458   Vladimir Davydov   memcg, slab: simp...
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
  	lockdep_assert_held(&memcg_slab_mutex);
  
  	id = memcg_cache_id(memcg);
  
  	/*
  	 * Since per-memcg caches are created asynchronously on first
  	 * allocation (see memcg_kmem_get_cache()), several threads can try to
  	 * create the same cache, but only one of them may succeed.
  	 */
  	if (cache_from_memcg_idx(root_cache, id))
1aa132542   Vladimir Davydov   memcg, slab: clea...
2790
  		return;
073ee1c6c   Vladimir Davydov   memcg: get rid of...
2791
  	cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2792
  	cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2edefe115   Vladimir Davydov   memcg, slab: fix ...
2793
  	/*
bd6731458   Vladimir Davydov   memcg, slab: simp...
2794
2795
2796
  	 * If we could not create a memcg cache, do not complain, because
  	 * that's not critical at all as we can always proceed with the root
  	 * cache.
2edefe115   Vladimir Davydov   memcg, slab: fix ...
2797
  	 */
bd6731458   Vladimir Davydov   memcg, slab: simp...
2798
2799
  	if (!cachep)
  		return;
2edefe115   Vladimir Davydov   memcg, slab: fix ...
2800

33a690c45   Vladimir Davydov   memcg: move memcg...
2801
  	css_get(&memcg->css);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2802
  	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
1aa132542   Vladimir Davydov   memcg, slab: clea...
2803

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2804
  	/*
959c8963f   Vladimir Davydov   memcg, slab: fix ...
2805
2806
2807
  	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
  	 * barrier here to ensure nobody will see the kmem_cache partially
  	 * initialized.
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2808
  	 */
959c8963f   Vladimir Davydov   memcg, slab: fix ...
2809
  	smp_wmb();
bd6731458   Vladimir Davydov   memcg, slab: simp...
2810
2811
  	BUG_ON(root_cache->memcg_params->memcg_caches[id]);
  	root_cache->memcg_params->memcg_caches[id] = cachep;
1aa132542   Vladimir Davydov   memcg, slab: clea...
2812
  }
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2813

776ed0f03   Vladimir Davydov   memcg: cleanup km...
2814
  static void memcg_unregister_cache(struct kmem_cache *cachep)
1aa132542   Vladimir Davydov   memcg, slab: clea...
2815
  {
bd6731458   Vladimir Davydov   memcg, slab: simp...
2816
  	struct kmem_cache *root_cache;
1aa132542   Vladimir Davydov   memcg, slab: clea...
2817
2818
  	struct mem_cgroup *memcg;
  	int id;
bd6731458   Vladimir Davydov   memcg, slab: simp...
2819
  	lockdep_assert_held(&memcg_slab_mutex);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2820

bd6731458   Vladimir Davydov   memcg, slab: simp...
2821
  	BUG_ON(is_root_cache(cachep));
2edefe115   Vladimir Davydov   memcg, slab: fix ...
2822

bd6731458   Vladimir Davydov   memcg, slab: simp...
2823
2824
  	root_cache = cachep->memcg_params->root_cache;
  	memcg = cachep->memcg_params->memcg;
96403da24   Vladimir Davydov   memcg: fix possib...
2825
  	id = memcg_cache_id(memcg);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2826

bd6731458   Vladimir Davydov   memcg, slab: simp...
2827
2828
  	BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
  	root_cache->memcg_params->memcg_caches[id] = NULL;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2829

bd6731458   Vladimir Davydov   memcg, slab: simp...
2830
2831
2832
  	list_del(&cachep->memcg_params->list);
  
  	kmem_cache_destroy(cachep);
33a690c45   Vladimir Davydov   memcg: move memcg...
2833
2834
2835
  
  	/* drop the reference taken in memcg_register_cache */
  	css_put(&memcg->css);
2633d7a02   Glauber Costa   slab/slub: consid...
2836
  }
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
  /*
   * During the creation of a new cache, we need to disable our accounting
   * mechanism altogether. This is true even if we are not creating, but rather
   * just enqueuing new caches to be created.
   *
   * This is because that process will trigger allocations; some visible, like
   * explicit kmallocs to auxiliary data structures, name strings and internal
   * cache structures; some well concealed, like INIT_WORK() that can allocate
   * objects during debug.
   *
   * If any allocation happens during memcg_kmem_get_cache, we will recurse back
   * to it. This may not be a bounded recursion: since the first cache creation
   * failed to complete (waiting on the allocation), we'll just try to create the
   * cache again, failing at the same point.
   *
   * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
   * memcg_kmem_skip_account. So we enclose anything that might allocate memory
   * inside the following two functions.
   */
  static inline void memcg_stop_kmem_account(void)
  {
  	VM_BUG_ON(!current->mm);
  	current->memcg_kmem_skip_account++;
  }
  
  static inline void memcg_resume_kmem_account(void)
  {
  	VM_BUG_ON(!current->mm);
  	current->memcg_kmem_skip_account--;
  }
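
  /*
   * Usage sketch: this is the bracket that memcg_schedule_register_cache()
   * below puts around cache creation, so that any allocation made while
   * setting up the new cache is skipped by memcg_kmem_get_cache():
   *
   *   memcg_stop_kmem_account();
   *   [kmalloc()s, INIT_WORK(), ... needed to create the cache]
   *   memcg_resume_kmem_account();
   */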
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2867
  int __memcg_cleanup_cache_params(struct kmem_cache *s)
7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2868
2869
  {
  	struct kmem_cache *c;
b8529907b   Vladimir Davydov   memcg, slab: do n...
2870
  	int i, failed = 0;
7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2871

bd6731458   Vladimir Davydov   memcg, slab: simp...
2872
  	mutex_lock(&memcg_slab_mutex);
7a67d7abc   Qiang Huang   memcg, kmem: use ...
2873
2874
  	for_each_memcg_cache_index(i) {
  		c = cache_from_memcg_idx(s, i);
7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2875
2876
  		if (!c)
  			continue;
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2877
  		memcg_unregister_cache(c);
b8529907b   Vladimir Davydov   memcg, slab: do n...
2878
2879
2880
  
  		if (cache_from_memcg_idx(s, i))
  			failed++;
7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2881
  	}
bd6731458   Vladimir Davydov   memcg, slab: simp...
2882
  	mutex_unlock(&memcg_slab_mutex);
b8529907b   Vladimir Davydov   memcg, slab: do n...
2883
  	return failed;
7cf279824   Glauber Costa   memcg/sl[au]b: tr...
2884
  }
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2885
  static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
1f458cbf1   Glauber Costa   memcg: destroy me...
2886
2887
  {
  	struct kmem_cache *cachep;
bd6731458   Vladimir Davydov   memcg, slab: simp...
2888
  	struct memcg_cache_params *params, *tmp;
1f458cbf1   Glauber Costa   memcg: destroy me...
2889
2890
2891
  
  	if (!memcg_kmem_is_active(memcg))
  		return;
bd6731458   Vladimir Davydov   memcg, slab: simp...
2892
2893
  	mutex_lock(&memcg_slab_mutex);
  	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
1f458cbf1   Glauber Costa   memcg: destroy me...
2894
  		cachep = memcg_params_to_cache(params);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2895
2896
  		kmem_cache_shrink(cachep);
  		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2897
  			memcg_unregister_cache(cachep);
1f458cbf1   Glauber Costa   memcg: destroy me...
2898
  	}
bd6731458   Vladimir Davydov   memcg, slab: simp...
2899
  	mutex_unlock(&memcg_slab_mutex);
1f458cbf1   Glauber Costa   memcg: destroy me...
2900
  }
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2901
  struct memcg_register_cache_work {
5722d094a   Vladimir Davydov   memcg, slab: clea...
2902
2903
2904
2905
  	struct mem_cgroup *memcg;
  	struct kmem_cache *cachep;
  	struct work_struct work;
  };
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2906
  static void memcg_register_cache_func(struct work_struct *w)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2907
  {
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2908
2909
  	struct memcg_register_cache_work *cw =
  		container_of(w, struct memcg_register_cache_work, work);
5722d094a   Vladimir Davydov   memcg, slab: clea...
2910
2911
  	struct mem_cgroup *memcg = cw->memcg;
  	struct kmem_cache *cachep = cw->cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2912

bd6731458   Vladimir Davydov   memcg, slab: simp...
2913
  	mutex_lock(&memcg_slab_mutex);
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2914
  	memcg_register_cache(memcg, cachep);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2915
  	mutex_unlock(&memcg_slab_mutex);
5722d094a   Vladimir Davydov   memcg, slab: clea...
2916
  	css_put(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2917
2918
2919
2920
2921
  	kfree(cw);
  }
  
  /*
   * Enqueue the creation of a per-memcg kmem_cache.
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2922
   */
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2923
2924
  static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
  					    struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2925
  {
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2926
  	struct memcg_register_cache_work *cw;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2927

776ed0f03   Vladimir Davydov   memcg: cleanup km...
2928
  	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
ca0dde971   Li Zefan   memcg: take refer...
2929
2930
  	if (cw == NULL) {
  		css_put(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2931
2932
2933
2934
2935
  		return;
  	}
  
  	cw->memcg = memcg;
  	cw->cachep = cachep;
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2936
  	INIT_WORK(&cw->work, memcg_register_cache_func);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2937
2938
  	schedule_work(&cw->work);
  }
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2939
2940
  static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
  					  struct kmem_cache *cachep)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2941
2942
2943
2944
  {
  	/*
  	 * We need to stop accounting when we kmalloc, because if the
  	 * corresponding kmalloc cache is not yet created, the first allocation
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2945
  	 * in __memcg_schedule_register_cache will recurse.
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2946
2947
2948
2949
2950
2951
2952
2953
  	 *
  	 * However, it is better to enclose the whole function. Depending on
  	 * the debugging options enabled, INIT_WORK(), for instance, can
  	 * trigger an allocation. This, too, will make us recurse. Because at
  	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
  	 * the safest choice is to do it like this, wrapping the whole function.
  	 */
  	memcg_stop_kmem_account();
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2954
  	__memcg_schedule_register_cache(memcg, cachep);
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2955
2956
  	memcg_resume_kmem_account();
  }
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
  
  int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
  {
  	int res;
  
  	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp,
  				PAGE_SIZE << order);
  	if (!res)
  		atomic_add(1 << order, &cachep->memcg_params->nr_pages);
  	return res;
  }
  
  void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
  {
  	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order);
  	atomic_sub(1 << order, &cachep->memcg_params->nr_pages);
  }
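
  /*
   * Hedged sketch of the assumed slab-side usage; the wrappers that call
   * these helpers live in the slab code, not here, so the surrounding names
   * are illustrative only:
   *
   *   [allocating a slab of 2^order pages for a per-memcg cache]
   *   if (__memcg_charge_slab(cachep, gfp, order))
   *       [fail the slab page allocation]
   *   ...
   *   [freeing that slab again]
   *   __memcg_uncharge_slab(cachep, order);
   */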
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
  /*
   * Return the kmem_cache we're supposed to use for a slab allocation.
   * We try to use the current memcg's version of the cache.
   *
   * If the cache does not exist yet, if we are the first user of it,
   * we either create it immediately, if possible, or create it asynchronously
   * in a workqueue.
   * In the latter case, we will let the current allocation go through with
   * the original cache.
   *
   * Can't be called in interrupt context or from kernel threads.
   * This function needs to be called with rcu_read_lock() held.
   */
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
  					  gfp_t gfp)
  {
  	struct mem_cgroup *memcg;
  	struct kmem_cache *memcg_cachep;
  
  	VM_BUG_ON(!cachep->memcg_params);
  	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
  	if (!current->mm || current->memcg_kmem_skip_account)
  		return cachep;
  	rcu_read_lock();
  	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));

  	if (!memcg_kmem_is_active(memcg))
  		goto out;

  	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
  	if (likely(memcg_cachep)) {
  		cachep = memcg_cachep;
  		goto out;
  	}
  	/* The corresponding put will be done in the workqueue. */
  	if (!css_tryget_online(&memcg->css))
  		goto out;
  	rcu_read_unlock();
  
  	/*
  	 * If we are in a safe context (can wait, and not in interrupt
	 * context), we could be predictable and return right away.
  	 * This would guarantee that the allocation being performed
  	 * already belongs in the new cache.
  	 *
  	 * However, there are some clashes that can arrive from locking.
  	 * For instance, because we acquire the slab_mutex while doing
  	 * memcg_create_kmem_cache, this means no further allocation
  	 * could happen with the slab_mutex held. So it's better to
  	 * defer everything.
  	 */
  	memcg_schedule_register_cache(memcg, cachep);
  	return cachep;
  out:
  	rcu_read_unlock();
  	return cachep;
  }

  /*
   * We need to verify if the allocation against current->mm->owner's memcg is
   * possible for the given order. But the page is not allocated yet, so we'll
   * need a further commit step to do the final arrangements.
   *
   * It is possible for the task to switch cgroups in this mean time, so at
   * commit time, we can't rely on task conversion any longer.  We'll then use
   * the handle argument to return to the caller which cgroup we should commit
   * against. We could also return the memcg directly and avoid the pointer
   * passing, but a boolean return value gives better semantics considering
   * the compiled-out case as well.
   *
   * Returning true means the allocation is possible.
   */
  bool
  __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	*_memcg = NULL;
  
  	/*
  	 * Disabling accounting is only relevant for some specific memcg
  	 * internal allocations. Therefore we would initially not have such
  	 * check here, since direct calls to the page allocator that are
  	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
  	 * outside memcg core. We are mostly concerned with cache allocations,
  	 * and by having this test at memcg_kmem_get_cache, we are already able
  	 * to relay the allocation to the root cache and bypass the memcg cache
  	 * altogether.
  	 *
  	 * There is one exception, though: the SLUB allocator does not create
	 * large order caches, but rather services large kmallocs directly from
  	 * the page allocator. Therefore, the following sequence when backed by
  	 * the SLUB allocator:
  	 *
  	 *	memcg_stop_kmem_account();
  	 *	kmalloc(<large_number>)
  	 *	memcg_resume_kmem_account();
  	 *
  	 * would effectively ignore the fact that we should skip accounting,
  	 * since it will drive us directly to this function without passing
  	 * through the cache selector memcg_kmem_get_cache. Such large
  	 * allocations are extremely rare but can happen, for instance, for the
  	 * cache arrays. We bring this test here.
  	 */
  	if (!current->mm || current->memcg_kmem_skip_account)
  		return true;
  	memcg = get_mem_cgroup_from_mm(current->mm);

  	if (!memcg_kmem_is_active(memcg)) {
  		css_put(&memcg->css);
  		return true;
  	}
  	ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
  	if (!ret)
  		*_memcg = memcg;
  
  	css_put(&memcg->css);
  	return (ret == 0);
  }
  
  void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      int order)
  {
  	struct page_cgroup *pc;
  
  	VM_BUG_ON(mem_cgroup_is_root(memcg));
  
  	/* The page allocation failed. Revert */
  	if (!page) {
  		memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
  		return;
  	}
  	/*
  	 * The page is freshly allocated and not visible to any
  	 * outside callers yet.  Set up pc non-atomically.
  	 */
  	pc = lookup_page_cgroup(page);
  	pc->mem_cgroup = memcg;
  	pc->flags = PCG_USED;
  }
  
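/*
 * Uncharge a kmem page at free time: if its page_cgroup is marked used,
 * clear it and return PAGE_SIZE << order to the owning memcg.
 */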
  void __memcg_kmem_uncharge_pages(struct page *page, int order)
  {
  	struct mem_cgroup *memcg = NULL;
  	struct page_cgroup *pc;
  
  
  	pc = lookup_page_cgroup(page);
  	if (!PageCgroupUsed(pc))
  		return;
  	memcg = pc->mem_cgroup;
  	pc->flags = 0;
  
  	/*
  	 * We trust that only if there is a memcg associated with the page, it
  	 * is a valid allocation
  	 */
  	if (!memcg)
  		return;
  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
  	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
  }
  #else
  static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
  {
  }
  #endif /* CONFIG_MEMCG_KMEM */
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * Because tail pages are not marked as "used", set it. We're under
   * zone->lru_lock, 'splitting on pmd' and compound_lock.
 * charge/uncharge will never happen and move_account() is done under
   * compound_lock(), so we don't have to take care of races.
   */
  void mem_cgroup_split_huge_fixup(struct page *head)
  {
  	struct page_cgroup *head_pc = lookup_page_cgroup(head);
  	struct page_cgroup *pc;
  	struct mem_cgroup *memcg;
  	int i;

  	if (mem_cgroup_disabled())
  		return;
  
  	memcg = head_pc->mem_cgroup;
  	for (i = 1; i < HPAGE_PMD_NR; i++) {
  		pc = head_pc + i;
  		pc->mem_cgroup = memcg;
  		pc->flags = head_pc->flags;
  	}
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
  		       HPAGE_PMD_NR);
  }
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

  /**
   * mem_cgroup_move_account - move account of the page
   * @page: the page
   * @nr_pages: number of regular pages (>1 for huge pages)
   * @pc:	page_cgroup of the page.
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
   *
   * The caller must confirm following.
   * - page is not on LRU (isolate_page() is useful.)
   * - compound_lock is held when nr_pages > 1
   *
   * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
   * from old cgroup.
   */
  static int mem_cgroup_move_account(struct page *page,
  				   unsigned int nr_pages,
  				   struct page_cgroup *pc,
  				   struct mem_cgroup *from,
  				   struct mem_cgroup *to)
  {
  	unsigned long flags;
  	int ret;

  	VM_BUG_ON(from == to);
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	/*
  	 * The page is isolated from LRU. So, collapse function
  	 * will not handle this page. But page splitting can happen.
  	 * Do this check under compound_page_lock(). The caller should
  	 * hold it.
  	 */
  	ret = -EBUSY;
  	if (nr_pages > 1 && !PageTransHuge(page))
  		goto out;
  	/*
  	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
  	 * of its source page while we change it: page migration takes
  	 * both pages off the LRU, but page cache replacement doesn't.
  	 */
  	if (!trylock_page(page))
  		goto out;
  
  	ret = -EINVAL;
  	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
  		goto out_unlock;

  	move_lock_mem_cgroup(from, &flags);

  	if (!PageAnon(page) && page_mapped(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  	}

  	if (PageWriteback(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  	}

  	/*
  	 * It is safe to change pc->mem_cgroup here because the page
  	 * is referenced, charged, and isolated - we can't race with
  	 * uncharging, charging, migration, or LRU putback.
  	 */

  	/* caller should have done css_get */
  	pc->mem_cgroup = to;
  	move_unlock_mem_cgroup(from, &flags);
  	ret = 0;
  
  	local_irq_disable();
  	mem_cgroup_charge_statistics(to, page, nr_pages);
  	memcg_check_events(to, page);
  	mem_cgroup_charge_statistics(from, page, -nr_pages);
  	memcg_check_events(from, page);
  	local_irq_enable();
  out_unlock:
  	unlock_page(page);
  out:
  	return ret;
  }
  /**
   * mem_cgroup_move_parent - moves page to the parent group
   * @page: the page to move
   * @pc: page_cgroup of the page
   * @child: page's cgroup
   *
   * move charges to its parent or the root cgroup if the group has no
   * parent (aka use_hierarchy==0).
   * Although this might fail (get_page_unless_zero, isolate_lru_page or
   * mem_cgroup_move_account fails) the failure is always temporary and
   * it signals a race with a page removal/uncharge or migration. In the
   * first case the page is on the way out and it will vanish from the LRU
   * on the next attempt and the call should be retried later.
   * Isolation from the LRU fails only if page has been isolated from
   * the LRU since we looked at it and that usually means either global
   * reclaim or migration going on. The page will either get back to the
   * LRU or vanish.
 * Finally mem_cgroup_move_account fails only if the page got uncharged
   * (!PageCgroupUsed) or moved to a different group. The page will
   * disappear in the next attempt.
   */
  static int mem_cgroup_move_parent(struct page *page,
  				  struct page_cgroup *pc,
  				  struct mem_cgroup *child)
  {
  	struct mem_cgroup *parent;
  	unsigned int nr_pages;
  	unsigned long uninitialized_var(flags);
  	int ret;
  	VM_BUG_ON(mem_cgroup_is_root(child));

  	ret = -EBUSY;
  	if (!get_page_unless_zero(page))
  		goto out;
  	if (isolate_lru_page(page))
  		goto put;

  	nr_pages = hpage_nr_pages(page);

  	parent = parent_mem_cgroup(child);
  	/*
  	 * If no parent, move charges to root cgroup.
  	 */
  	if (!parent)
  		parent = root_mem_cgroup;

  	if (nr_pages > 1) {
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  		flags = compound_lock_irqsave(page);
  	}

  	ret = mem_cgroup_move_account(page, nr_pages,
  				pc, child, parent);
  	if (!ret)
  		__mem_cgroup_cancel_local_charge(child, nr_pages);

  	if (nr_pages > 1)
  		compound_unlock_irqrestore(page, flags);
  	putback_lru_page(page);
  put:
  	put_page(page);
  out:
  	return ret;
  }
  #ifdef CONFIG_MEMCG_SWAP
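/* Adjust the per-cpu MEM_CGROUP_STAT_SWAP counter by one page up or down. */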
  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
  					 bool charge)
  {
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
  }
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
   * The caller must have charged to @to, IOW, called res_counter_charge() about
   * both res and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
  				struct mem_cgroup *from, struct mem_cgroup *to)
  {
  	unsigned short old_id, new_id;
  	old_id = mem_cgroup_id(from);
  	new_id = mem_cgroup_id(to);
  
  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
  		mem_cgroup_swap_statistics(from, false);
  		mem_cgroup_swap_statistics(to, true);
  		/*
  		 * This function is only called from task migration context now.
  		 * It postpones res_counter and refcount handling till the end
  		 * of task migration(mem_cgroup_clear_mc()) for performance
  		 * improvement. But we cannot postpone css_get(to)  because if
  		 * the process that has been moved to @to does swap-in, the
  		 * refcount of @to might be decreased to 0.
  		 *
  		 * We are in attach() phase, so the cgroup is guaranteed to be
  		 * alive, so we can just call css_get().
  		 */
  		css_get(&to->css);
  		return 0;
  	}
  	return -EINVAL;
  }
  #else
  static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
  				struct mem_cgroup *from, struct mem_cgroup *to)
  {
  	return -EINVAL;
  }
  #endif

  #ifdef CONFIG_DEBUG_VM
  static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
  {
  	struct page_cgroup *pc;
  
  	pc = lookup_page_cgroup(page);
  	/*
  	 * Can be NULL while feeding pages into the page allocator for
  	 * the first time, i.e. during boot or memory hotplug;
  	 * or when mem_cgroup_disabled().
  	 */
  	if (likely(pc) && PageCgroupUsed(pc))
  		return pc;
  	return NULL;
  }
  
  bool mem_cgroup_bad_page_check(struct page *page)
  {
  	if (mem_cgroup_disabled())
  		return false;
  
  	return lookup_page_cgroup_used(page) != NULL;
  }
  
  void mem_cgroup_print_bad_page(struct page *page)
  {
  	struct page_cgroup *pc;
  
  	pc = lookup_page_cgroup_used(page);
  	if (pc) {
		pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
			 pc, pc->flags, pc->mem_cgroup);
  	}
  }
  #endif
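/*
 * Set a new limit on memcg->res, reclaiming pages when the new limit is
 * below the current usage.  The new value must not exceed the memsw limit.
 */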
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
  				unsigned long long val)
  {
  	int retry_count;
  	int ret = 0;
  	int children = mem_cgroup_count_children(memcg);
  	u64 curusage, oldusage;
  	int enlarge;
  
  	/*
  	 * For keeping hierarchical_reclaim simple, how long we should retry
	 * depends on callers. We set our retry-count to be a function
  	 * of # of children which we should visit in this loop.
  	 */
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
  
  	oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);

  	enlarge = 0;
  	while (retry_count) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
  		/*
		 * Rather than hide all this in some function, I do this in an
		 * open coded manner, so you can see what this really does.
  		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
  		 */
  		mutex_lock(&set_limit_mutex);
  		if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
  			break;
  		}

  		if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
  			enlarge = 1;
  		ret = res_counter_set_limit(&memcg->res, val);
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
  		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
  		/* Usage is reduced ? */
  		if (curusage >= oldusage)
  			retry_count--;
  		else
  			oldusage = curusage;
  	}
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);

  	return ret;
  }
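/*
 * Set a new limit on memcg->memsw (memory+swap).  The new value must not be
 * below the plain memory limit; usage is pushed down by reclaim as needed.
 */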
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
  					unsigned long long val)
  {
  	int retry_count;
  	u64 oldusage, curusage;
  	int children = mem_cgroup_count_children(memcg);
  	int ret = -EBUSY;
  	int enlarge = 0;

  	/* see mem_cgroup_resize_res_limit */
  	retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
  	oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
  	while (retry_count) {
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
  		/*
		 * Rather than hide all this in some function, I do this in an
		 * open coded manner, so you can see what this really does.
  		 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
  		 */
  		mutex_lock(&set_limit_mutex);
  		if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
  			ret = -EINVAL;
  			mutex_unlock(&set_limit_mutex);
  			break;
  		}
  		if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
  			enlarge = 1;
  		ret = res_counter_set_limit(&memcg->memsw, val);
  		mutex_unlock(&set_limit_mutex);
  
  		if (!ret)
  			break;
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
  		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
  		/* Usage is reduced ? */
  		if (curusage >= oldusage)
  			retry_count--;
  		else
  			oldusage = curusage;
  	}
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
  	return ret;
  }
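/*
 * Reclaim from the memcgs that exceed their soft limit the most on this
 * zone, walking the per-zone soft-limit tree until some progress is made
 * or the loop limit is hit.  Returns the number of pages reclaimed.
 */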
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  					    gfp_t gfp_mask,
  					    unsigned long *total_scanned)
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
  	unsigned long long excess;
  	unsigned long nr_scanned;
  
  	if (order > 0)
  		return 0;
  
  	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
  	/*
	 * This loop can run a while, especially if mem_cgroups continuously
  	 * keep exceeding their soft limit and putting the system under
  	 * pressure
  	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
  
  		nr_scanned = 0;
  		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
  						    gfp_mask, &nr_scanned);
  		nr_reclaimed += reclaimed;
  		*total_scanned += nr_scanned;
  		spin_lock_irq(&mctz->lock);
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
  		if (!reclaimed) {
  			do {
  				/*
  				 * Loop until we find yet another one.
  				 *
  				 * By the time we get the soft_limit lock
				 * again, someone might have added the
  				 * group back on the RB tree. Iterate to
  				 * make sure we get a different mem.
  				 * mem_cgroup_largest_soft_limit_node returns
  				 * NULL if no other cgroup is present on
  				 * the tree
  				 */
  				next_mz =
  				__mem_cgroup_largest_soft_limit_node(mctz);
  				if (next_mz == mz)
  					css_put(&next_mz->memcg->css);
  				else /* next_mz == NULL or other memcg */
  					break;
  			} while (1);
  		}
  		__mem_cgroup_remove_exceeded(mz, mctz);
  		excess = res_counter_soft_limit_excess(&mz->memcg->res);
  		/*
  		 * One school of thought says that we should not add
  		 * back the node to the tree if reclaim returns 0.
  		 * But our reclaim could return 0, simply because due
  		 * to priority we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this as a longer
  		 * term TODO.
  		 */
  		/* If excess == 0, no tree ops */
  		__mem_cgroup_insert_exceeded(mz, mctz, excess);
  		spin_unlock_irq(&mctz->lock);
  		css_put(&mz->memcg->css);
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
  		css_put(&next_mz->memcg->css);
  	return nr_reclaimed;
  }
  /**
   * mem_cgroup_force_empty_list - clears LRU of a group
   * @memcg: group to clear
   * @node: NUMA node
   * @zid: zone id
 * @lru: lru to clear
   *
   * Traverse a specified page_cgroup list and try to drop them all.  This doesn't
 * reclaim the pages themselves - pages are moved to the parent (or root)
   * group.
   */
  static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
  				int node, int zid, enum lru_list lru)
  {
  	struct lruvec *lruvec;
  	unsigned long flags;
  	struct list_head *list;
  	struct page *busy;
  	struct zone *zone;

  	zone = &NODE_DATA(node)->node_zones[zid];
  	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
  	list = &lruvec->lists[lru];

  	busy = NULL;
  	do {
  		struct page_cgroup *pc;
  		struct page *page;
  		spin_lock_irqsave(&zone->lru_lock, flags);
  		if (list_empty(list)) {
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
  			break;
  		}
  		page = list_entry(list->prev, struct page, lru);
  		if (busy == page) {
  			list_move(&page->lru, list);
  			busy = NULL;
  			spin_unlock_irqrestore(&zone->lru_lock, flags);
  			continue;
  		}
  		spin_unlock_irqrestore(&zone->lru_lock, flags);

  		pc = lookup_page_cgroup(page);

  		if (mem_cgroup_move_parent(page, pc, memcg)) {
  			/* found lock contention or "pc" is obsolete. */
  			busy = page;
  		} else
  			busy = NULL;
  		cond_resched();
  	} while (!list_empty(list));
  }
  
  /*
 * make mem_cgroup's charge 0 if there is no task, by moving
   * all the charges and pages to the parent.
   * This enables deleting this mem_cgroup.
   *
   * Caller is responsible for holding css reference on the memcg.
   */
  static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
  {
  	int node, zid;
  	u64 usage;

  	do {
  		/* This is for making all *used* pages to be on LRU. */
  		lru_add_drain_all();
  		drain_all_stock_sync(memcg);
  		mem_cgroup_start_move(memcg);
  		for_each_node_state(node, N_MEMORY) {
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  				enum lru_list lru;
  				for_each_lru(lru) {
  					mem_cgroup_force_empty_list(memcg,
  							node, zid, lru);
  				}
  			}
  		}
  		mem_cgroup_end_move(memcg);
  		memcg_oom_recover(memcg);
  		cond_resched();

  		/*
  		 * Kernel memory may not necessarily be trackable to a specific
  		 * process. So they are not migrated, and therefore we can't
  		 * expect their value to drop to 0 here.
  		 * Having res filled up with kmem only is enough.
  		 *
  		 * This is a safety check because mem_cgroup_force_empty_list
  		 * could have raced with mem_cgroup_replace_page_cache callers
  		 * so the lru seemed empty but the page could have been added
  		 * right after the check. RES_USAGE should be safe as we always
  		 * charge before adding to the LRU.
  		 */
  		usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
  			res_counter_read_u64(&memcg->kmem, RES_USAGE);
  	} while (usage > 0);
  }
  /*
   * Test whether @memcg has children, dead or alive.  Note that this
   * function doesn't care whether @memcg has use_hierarchy enabled and
   * returns %true if there are child csses according to the cgroup
 * hierarchy.  Testing use_hierarchy is the caller's responsibility.
   */
  static inline bool memcg_has_children(struct mem_cgroup *memcg)
  {
  	bool ret;
  	/*
  	 * The lock does not prevent addition or deletion of children, but
  	 * it prevents a new child from being initialized based on this
  	 * parent in css_online(), so it's enough to decide whether
  	 * hierarchically inherited attributes can still be changed or not.
  	 */
  	lockdep_assert_held(&memcg_create_mutex);
  
  	rcu_read_lock();
  	ret = css_next_child(NULL, &memcg->css);
  	rcu_read_unlock();
  	return ret;
  }
  
  /*
   * Reclaims as many pages from the given memcg as possible and moves
   * the rest to the parent.
   *
   * Caller is responsible for holding css reference for memcg.
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
  {
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
  	/* try to free all pages in this cgroup */
  	while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
  		int progress;

  		if (signal_pending(current))
  			return -EINTR;
  		progress = try_to_free_mem_cgroup_pages(memcg, 1,
  							GFP_KERNEL, true);
  		if (!progress) {
  			nr_retries--;
  			/* maybe some writeback is necessary */
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
  		}
  
  	}
  
  	return 0;
  }
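/*
 * Write handler for the force_empty control file; not allowed on the
 * root cgroup.
 */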
  static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
  					    char *buf, size_t nbytes,
  					    loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));

  	if (mem_cgroup_is_root(memcg))
  		return -EINVAL;
  	return mem_cgroup_force_empty(memcg) ?: nbytes;
  }
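/* Read handler for the use_hierarchy control file. */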
  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
  				     struct cftype *cft)
  {
  	return mem_cgroup_from_css(css)->use_hierarchy;
  }
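/*
 * Write handler for use_hierarchy.  The value can only be changed while the
 * parent does not have use_hierarchy set and this cgroup has no children.
 */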
  static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
  				      struct cftype *cft, u64 val)
  {
  	int retval = 0;
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);

  	mutex_lock(&memcg_create_mutex);
  
  	if (memcg->use_hierarchy == val)
  		goto out;
  	/*
  	 * If parent's use_hierarchy is set, we can't make any modifications
  	 * in the child subtrees. If it is unset, then the change can
  	 * occur, provided the current cgroup has no children.
  	 *
  	 * For the root cgroup, parent_mem is NULL, we allow value to be
  	 * set if there are no children.
  	 */
  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
  				(val == 1 || val == 0)) {
  		if (!memcg_has_children(memcg))
  			memcg->use_hierarchy = val;
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
  
  out:
  	mutex_unlock(&memcg_create_mutex);
  
  	return retval;
  }
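/* Sum one statistics counter over @memcg and all of its descendants. */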
  static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
  					       enum mem_cgroup_stat_index idx)
  {
  	struct mem_cgroup *iter;
  	long val = 0;
  
  	/* Per-cpu values can be negative, use a signed accumulator */
  	for_each_mem_cgroup_tree(iter, memcg)
  		val += mem_cgroup_read_stat(iter, idx);
  
  	if (val < 0) /* race ? */
  		val = 0;
  	return val;
  }
  
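/*
 * Current usage in bytes.  For the root cgroup the value is derived from the
 * recursive cache/rss (and optionally swap) statistics rather than from the
 * res_counter.
 */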
  static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
  {
  	u64 val;
  
  	if (!mem_cgroup_is_root(memcg)) {
  		if (!swap)
  			return res_counter_read_u64(&memcg->res, RES_USAGE);
  		else
  			return res_counter_read_u64(&memcg->memsw, RES_USAGE);
  	}
  
  	/*
  	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
  	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
  	 */
  	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
  	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
  
  	if (swap)
  		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
  
  	return val << PAGE_SHIFT;
  }
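/* Read handler for the res_counter backed control files (_MEM/_MEMSWAP/_KMEM). */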
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
  			       struct cftype *cft)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  	enum res_type type = MEMFILE_TYPE(cft->private);
  	int name = MEMFILE_ATTR(cft->private);

  	switch (type) {
  	case _MEM:
  		if (name == RES_USAGE)
  			return mem_cgroup_usage(memcg, false);
  		return res_counter_read_u64(&memcg->res, name);
  	case _MEMSWAP:
  		if (name == RES_USAGE)
  			return mem_cgroup_usage(memcg, true);
  		return res_counter_read_u64(&memcg->memsw, name);
  	case _KMEM:
  		return res_counter_read_u64(&memcg->kmem, name);
  		break;
  	default:
  		BUG();
  	}
  }

  #ifdef CONFIG_MEMCG_KMEM
  /* should be called with activate_kmem_mutex held */
  static int __memcg_activate_kmem(struct mem_cgroup *memcg,
  				 unsigned long long limit)
  {
  	int err = 0;
  	int memcg_id;
  
  	if (memcg_kmem_is_active(memcg))
  		return 0;
  
  	/*
  	 * We are going to allocate memory for data shared by all memory
  	 * cgroups so let's stop accounting here.
  	 */
  	memcg_stop_kmem_account();
  	/*
  	 * For simplicity, we won't allow this to be disabled.  It also can't
  	 * be changed if the cgroup has children already, or if tasks had
  	 * already joined.
  	 *
  	 * If tasks join before we set the limit, a person looking at
  	 * kmem.usage_in_bytes will have no way to determine when it took
  	 * place, which makes the value quite meaningless.
  	 *
  	 * After it first became limited, changes in the value of the limit are
  	 * of course permitted.
  	 */
  	mutex_lock(&memcg_create_mutex);
  	if (cgroup_has_tasks(memcg->css.cgroup) ||
  	    (memcg->use_hierarchy && memcg_has_children(memcg)))
  		err = -EBUSY;
  	mutex_unlock(&memcg_create_mutex);
  	if (err)
  		goto out;

  	memcg_id = memcg_alloc_cache_id();
  	if (memcg_id < 0) {
  		err = memcg_id;
  		goto out;
  	}
  	memcg->kmemcg_id = memcg_id;
  	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
  
  	/*
  	 * We couldn't have accounted to this cgroup, because it hasn't got the
  	 * active bit set yet, so this should succeed.
  	 */
  	err = res_counter_set_limit(&memcg->kmem, limit);
  	VM_BUG_ON(err);
  
  	static_key_slow_inc(&memcg_kmem_enabled_key);
  	/*
  	 * Setting the active bit after enabling static branching will
  	 * guarantee no one starts accounting before all call sites are
  	 * patched.
  	 */
  	memcg_kmem_set_active(memcg);
  out:
  	memcg_resume_kmem_account();
  	return err;
  }
  
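/* Like __memcg_activate_kmem(), but takes activate_kmem_mutex itself. */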
  static int memcg_activate_kmem(struct mem_cgroup *memcg,
  			       unsigned long long limit)
  {
  	int ret;
  
  	mutex_lock(&activate_kmem_mutex);
  	ret = __memcg_activate_kmem(memcg, limit);
  	mutex_unlock(&activate_kmem_mutex);
  	return ret;
  }
  
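/* Update the kmem limit, activating kmem accounting on first use. */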
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
  				   unsigned long long val)
  {
  	int ret;
  
  	if (!memcg_kmem_is_active(memcg))
  		ret = memcg_activate_kmem(memcg, val);
  	else
  		ret = res_counter_set_limit(&memcg->kmem, val);
  	return ret;
  }
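/*
 * Propagate kmem accounting from the parent to a newly created memcg: if the
 * parent is kmem-active, activate this group too with an unrestricted limit.
 */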
  static int memcg_propagate_kmem(struct mem_cgroup *memcg)
  {
  	int ret = 0;
  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);

  	if (!parent)
  		return 0;

  	mutex_lock(&activate_kmem_mutex);
  	/*
  	 * If the parent cgroup is not kmem-active now, it cannot be activated
  	 * after this point, because it has at least one child already.
  	 */
  	if (memcg_kmem_is_active(parent))
  		ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
  	mutex_unlock(&activate_kmem_mutex);
  	return ret;
  }
  #else
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
  				   unsigned long long val)
  {
  	return -EINVAL;
  }
  #endif /* CONFIG_MEMCG_KMEM */

  /*
   * The user of this function is...
   * RES_LIMIT.
   */
  static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	enum res_type type;
  	int name;
  	unsigned long long val;
  	int ret;
  	buf = strstrip(buf);
  	type = MEMFILE_TYPE(of_cft(of)->private);
  	name = MEMFILE_ATTR(of_cft(of)->private);

  	switch (name) {
  	case RES_LIMIT:
4b3bde4c9   Balbir Singh   memcg: remove the...
3996
3997
3998
3999
  		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
  			ret = -EINVAL;
  			break;
  		}
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
4000
  		/* This function does all necessary parse...reuse it */
451af504d   Tejun Heo   cgroup: replace c...
4001
  		ret = res_counter_memparse_write_strategy(buf, &val);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4002
4003
4004
  		if (ret)
  			break;
  		if (type == _MEM)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
4005
  			ret = mem_cgroup_resize_limit(memcg, val);
510fc4e11   Glauber Costa   memcg: kmem accou...
4006
  		else if (type == _MEMSWAP)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4007
  			ret = mem_cgroup_resize_memsw_limit(memcg, val);
510fc4e11   Glauber Costa   memcg: kmem accou...
4008
  		else if (type == _KMEM)
d64416377   Vladimir Davydov   memcg: rework mem...
4009
  			ret = memcg_update_kmem_limit(memcg, val);
510fc4e11   Glauber Costa   memcg: kmem accou...
4010
4011
  		else
  			return -EINVAL;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
4012
  		break;
296c81d89   Balbir Singh   memory controller...
4013
  	case RES_SOFT_LIMIT:
451af504d   Tejun Heo   cgroup: replace c...
4014
  		ret = res_counter_memparse_write_strategy(buf, &val);
296c81d89   Balbir Singh   memory controller...
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
  		if (ret)
  			break;
  		/*
  		 * For memsw, soft limits are hard to implement in terms
  		 * of semantics, for now, we support soft limits for
  		 * control without swap
  		 */
  		if (type == _MEM)
  			ret = res_counter_set_soft_limit(&memcg->res, val);
  		else
  			ret = -EINVAL;
  		break;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
4027
4028
4029
4030
  	default:
  		ret = -EINVAL; /* should be BUG() ? */
  		break;
  	}
451af504d   Tejun Heo   cgroup: replace c...
4031
  	return ret ?: nbytes;
8cdea7c05   Balbir Singh   Memory controller...
4032
  }
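  /*
   * Usage illustration (userspace, not part of this file's build): a minimal
   * sketch of driving the write handler above through cgroupfs.  The mount
   * point and group name are assumptions; only the file names come from
   * mem_cgroup_files[] below.
   *
   *	#include <fcntl.h>
   *	#include <string.h>
   *	#include <unistd.h>
   *
   *	static int set_memcg_limit(const char *file, const char *val)
   *	{
   *		int fd = open(file, O_WRONLY);
   *		ssize_t ret;
   *
   *		if (fd < 0)
   *			return -1;
   *		// mem_cgroup_write() strstrip()s the buffer and parses it with
   *		// res_counter_memparse_write_strategy(), so a value like "512M"
   *		// is accepted.
   *		ret = write(fd, val, strlen(val));
   *		close(fd);
   *		return ret < 0 ? -1 : 0;
   *	}
   *
   *	// e.g. set_memcg_limit("/sys/fs/cgroup/memory/foo/memory.limit_in_bytes", "512M");
   *	// or   set_memcg_limit("/sys/fs/cgroup/memory/foo/memory.kmem.limit_in_bytes", "64M");
   */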
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4033
4034
4035
  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
  		unsigned long long *mem_limit, unsigned long long *memsw_limit)
  {
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4036
4037
4038
4039
  	unsigned long long min_limit, min_memsw_limit, tmp;
  
  	min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
  	min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4040
4041
  	if (!memcg->use_hierarchy)
  		goto out;
5c9d535b8   Tejun Heo   cgroup: remove cs...
4042
4043
  	while (memcg->css.parent) {
  		memcg = mem_cgroup_from_css(memcg->css.parent);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
  		if (!memcg->use_hierarchy)
  			break;
  		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
  		min_limit = min(min_limit, tmp);
  		tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
  		min_memsw_limit = min(min_memsw_limit, tmp);
  	}
  out:
  	*mem_limit = min_limit;
  	*memsw_limit = min_memsw_limit;
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4054
  }
6770c64e5   Tejun Heo   cgroup: replace c...
4055
4056
  static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
  				size_t nbytes, loff_t off)
c84872e16   Pavel Emelyanov   memcgroup: add th...
4057
  {
6770c64e5   Tejun Heo   cgroup: replace c...
4058
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
86ae53e1a   Glauber Costa   memcg: change def...
4059
4060
  	int name;
  	enum res_type type;
c84872e16   Pavel Emelyanov   memcgroup: add th...
4061

6770c64e5   Tejun Heo   cgroup: replace c...
4062
4063
  	type = MEMFILE_TYPE(of_cft(of)->private);
  	name = MEMFILE_ATTR(of_cft(of)->private);
af36f906c   Tejun Heo   memcg: always cre...
4064

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4065
  	switch (name) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
4066
  	case RES_MAX_USAGE:
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4067
  		if (type == _MEM)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4068
  			res_counter_reset_max(&memcg->res);
510fc4e11   Glauber Costa   memcg: kmem accou...
4069
  		else if (type == _MEMSWAP)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4070
  			res_counter_reset_max(&memcg->memsw);
510fc4e11   Glauber Costa   memcg: kmem accou...
4071
4072
4073
4074
  		else if (type == _KMEM)
  			res_counter_reset_max(&memcg->kmem);
  		else
  			return -EINVAL;
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
4075
4076
  		break;
  	case RES_FAILCNT:
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4077
  		if (type == _MEM)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4078
  			res_counter_reset_failcnt(&memcg->res);
510fc4e11   Glauber Costa   memcg: kmem accou...
4079
  		else if (type == _MEMSWAP)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4080
  			res_counter_reset_failcnt(&memcg->memsw);
510fc4e11   Glauber Costa   memcg: kmem accou...
4081
4082
4083
4084
  		else if (type == _KMEM)
  			res_counter_reset_failcnt(&memcg->kmem);
  		else
  			return -EINVAL;
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
4085
4086
  		break;
  	}
f64c3f549   Balbir Singh   memory controller...
4087

6770c64e5   Tejun Heo   cgroup: replace c...
4088
  	return nbytes;
c84872e16   Pavel Emelyanov   memcgroup: add th...
4089
  }
182446d08   Tejun Heo   cgroup: pass arou...
4090
  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
4091
4092
  					struct cftype *cft)
  {
182446d08   Tejun Heo   cgroup: pass arou...
4093
  	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4094
  }
024914477   Daisuke Nishimura   memcg: move charg...
4095
  #ifdef CONFIG_MMU
182446d08   Tejun Heo   cgroup: pass arou...
4096
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
4097
4098
  					struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
4099
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4100
4101
4102
  
  	if (val >= (1 << NR_MOVE_TYPE))
  		return -EINVAL;
ee5e8472b   Glauber Costa   memcg: prevent ch...
4103

7dc74be03   Daisuke Nishimura   memcg: add interf...
4104
  	/*
ee5e8472b   Glauber Costa   memcg: prevent ch...
4105
4106
4107
4108
  	 * No locking is needed here, because ->can_attach() will check this
  	 * value once at the beginning of the process, and then carry
  	 * on with stale data. This means that changes to this value will only
  	 * affect task migrations starting after the change.
7dc74be03   Daisuke Nishimura   memcg: add interf...
4109
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
4110
  	memcg->move_charge_at_immigrate = val;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4111
4112
  	return 0;
  }
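  /*
   * Usage illustration (userspace): the written value is a bitmask of move
   * types and is rejected once it reaches 1 << NR_MOVE_TYPE.  The path is an
   * assumption about the cgroupfs mount point; "3" is the commonly used mask
   * that enables moving of both anonymous and file charges (the individual
   * bits are defined by the move-type enum earlier in this file).
   *
   *	int fd = open("/sys/fs/cgroup/memory/foo/memory.move_charge_at_immigrate",
   *		      O_WRONLY);
   *	write(fd, "3", 1);
   *	close(fd);
   */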
024914477   Daisuke Nishimura   memcg: move charg...
4113
  #else
182446d08   Tejun Heo   cgroup: pass arou...
4114
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
024914477   Daisuke Nishimura   memcg: move charg...
4115
4116
4117
4118
4119
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
4120

406eb0c9b   Ying Han   memcg: add memory...
4121
  #ifdef CONFIG_NUMA
2da8ca822   Tejun Heo   cgroup: replace c...
4122
  static int memcg_numa_stat_show(struct seq_file *m, void *v)
406eb0c9b   Ying Han   memcg: add memory...
4123
  {
25485de6e   Greg Thelen   memcg: refactor m...
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
  	struct numa_stat {
  		const char *name;
  		unsigned int lru_mask;
  	};
  
  	static const struct numa_stat stats[] = {
  		{ "total", LRU_ALL },
  		{ "file", LRU_ALL_FILE },
  		{ "anon", LRU_ALL_ANON },
  		{ "unevictable", BIT(LRU_UNEVICTABLE) },
  	};
  	const struct numa_stat *stat;
406eb0c9b   Ying Han   memcg: add memory...
4136
  	int nid;
25485de6e   Greg Thelen   memcg: refactor m...
4137
  	unsigned long nr;
2da8ca822   Tejun Heo   cgroup: replace c...
4138
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
406eb0c9b   Ying Han   memcg: add memory...
4139

25485de6e   Greg Thelen   memcg: refactor m...
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
  		seq_printf(m, "%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
  							  stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
4150
  	}
406eb0c9b   Ying Han   memcg: add memory...
4151

071aee138   Ying Han   memcg: support hi...
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		struct mem_cgroup *iter;
  
  		nr = 0;
  		for_each_mem_cgroup_tree(iter, memcg)
  			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
  		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = 0;
  			for_each_mem_cgroup_tree(iter, memcg)
  				nr += mem_cgroup_node_nr_lru_pages(
  					iter, nid, stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
4168
  	}
406eb0c9b   Ying Han   memcg: add memory...
4169

406eb0c9b   Ying Han   memcg: add memory...
4170
4171
4172
  	return 0;
  }
  #endif /* CONFIG_NUMA */
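  /*
   * For reference, memcg_numa_stat_show() above emits one "name=<total>
   * N<nid>=<pages>..." line per entry of stats[], followed by the
   * "hierarchical_" variants; e.g. on a two-node machine (values invented
   * for illustration):
   *
   *	total=200 N0=150 N1=50
   *	file=120 N0=100 N1=20
   *	anon=80 N0=50 N1=30
   *	unevictable=0 N0=0 N1=0
   *	hierarchical_total=500 N0=400 N1=100
   *	...
   */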
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4173
4174
4175
4176
  static inline void mem_cgroup_lru_names_not_uptodate(void)
  {
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
  }
2da8ca822   Tejun Heo   cgroup: replace c...
4177
  static int memcg_stat_show(struct seq_file *m, void *v)
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4178
  {
2da8ca822   Tejun Heo   cgroup: replace c...
4179
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4180
4181
  	struct mem_cgroup *mi;
  	unsigned int i;
406eb0c9b   Ying Han   memcg: add memory...
4182

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4183
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
4184
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
4185
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4186
4187
4188
  		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
  			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
4189
  	}
7b854121e   Lee Schermerhorn   Unevictable LRU P...
4190

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4191
4192
4193
4194
4195
4196
4197
4198
4199
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
  		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
  			   mem_cgroup_read_events(memcg, i));
  
  	for (i = 0; i < NR_LRU_LISTS; i++)
  		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
  			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
4200
  	/* Hierarchical information */
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4201
4202
  	{
  		unsigned long long limit, memsw_limit;
d79154bb5   Hugh Dickins   memcg: replace me...
4203
  		memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
4204
4205
  		seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4206
  		if (do_swap_account)
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
4207
4208
4209
  			seq_printf(m, "hierarchical_memsw_limit %llu\n",
  				   memsw_limit);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
4210
  	}
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4211

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4212
4213
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  		long long val = 0;
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
4214
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
4215
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
  		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
  	}
  
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_events(mi, i);
  		seq_printf(m, "total_%s %llu\n",
  			   mem_cgroup_events_names[i], val);
  	}
  
  	for (i = 0; i < NR_LRU_LISTS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
  		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
4239
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
4240

7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4241
  #ifdef CONFIG_DEBUG_VM
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4242
4243
4244
  	{
  		int nid, zid;
  		struct mem_cgroup_per_zone *mz;
89abfab13   Hugh Dickins   mm/memcg: move re...
4245
  		struct zone_reclaim_stat *rstat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4246
4247
4248
4249
4250
  		unsigned long recent_rotated[2] = {0, 0};
  		unsigned long recent_scanned[2] = {0, 0};
  
  		for_each_online_node(nid)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
4251
  				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
89abfab13   Hugh Dickins   mm/memcg: move re...
4252
  				rstat = &mz->lruvec.reclaim_stat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4253

89abfab13   Hugh Dickins   mm/memcg: move re...
4254
4255
4256
4257
  				recent_rotated[0] += rstat->recent_rotated[0];
  				recent_rotated[1] += rstat->recent_rotated[1];
  				recent_scanned[0] += rstat->recent_scanned[0];
  				recent_scanned[1] += rstat->recent_scanned[1];
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4258
  			}
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
4259
4260
4261
4262
4263
4264
4265
4266
  		seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
  		seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
  		seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
  		seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
4267
4268
  	}
  #endif
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4269
4270
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
4271
4272
  static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
  				      struct cftype *cft)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4273
  {
182446d08   Tejun Heo   cgroup: pass arou...
4274
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4275

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
4276
  	return mem_cgroup_swappiness(memcg);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4277
  }
182446d08   Tejun Heo   cgroup: pass arou...
4278
4279
  static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
  				       struct cftype *cft, u64 val)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4280
  {
182446d08   Tejun Heo   cgroup: pass arou...
4281
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4282

3dae7fec5   Johannes Weiner   mm: memcontrol: r...
4283
  	if (val > 100)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4284
  		return -EINVAL;
14208b0ec   Linus Torvalds   Merge branch 'for...
4285
  	if (css->parent)
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
4286
4287
4288
  		memcg->swappiness = val;
  	else
  		vm_swappiness = val;
068b38c1f   Li Zefan   memcg: fix a race...
4289

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4290
4291
  	return 0;
  }
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4292
4293
4294
4295
4296
4297
4298
4299
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
  	struct mem_cgroup_threshold_ary *t;
  	u64 usage;
  	int i;
  
  	rcu_read_lock();
  	if (!swap)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4300
  		t = rcu_dereference(memcg->thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4301
  	else
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4302
  		t = rcu_dereference(memcg->memsw_thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4303
4304
4305
  
  	if (!t)
  		goto unlock;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4306
  	usage = mem_cgroup_usage(memcg, swap);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4307
4308
  
  	/*
748dad36d   Sha Zhengju   memcg: make thres...
4309
  	 * current_threshold points to the threshold just below or equal to usage.
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4310
4311
4312
  	 * If that is not the case, a threshold was crossed after the last
  	 * call of __mem_cgroup_threshold().
  	 */
5407a5625   Phil Carmody   mm: remove unnece...
4313
  	i = t->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
  
  	/*
  	 * Iterate backward over the array of thresholds starting from
  	 * current_threshold and check if a threshold is crossed.
  	 * If none of the thresholds below usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* i = current_threshold + 1 */
  	i++;
  
  	/*
  	 * Iterate forward over the array of thresholds starting from
  	 * current_threshold+1 and check if a threshold is crossed.
  	 * If none of the thresholds above usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* Update current_threshold */
5407a5625   Phil Carmody   mm: remove unnece...
4337
  	t->current_threshold = i - 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4338
4339
4340
4341
4342
4343
  unlock:
  	rcu_read_unlock();
  }
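  /*
   * Worked example for the scan above (numbers invented for illustration):
   * with thresholds {4M, 8M, 16M} sorted ascending and usage previously at
   * 10M, current_threshold is 1 (the 8M entry).  If usage rises to 20M, the
   * forward scan signals the 16M eventfd and current_threshold becomes 2.
   * If usage then drops to 6M, the backward scan signals the 16M and 8M
   * eventfds and current_threshold becomes 0.
   */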
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
ad4ca5f4b   Kirill A. Shutemov   memcg: fix thresh...
4344
4345
4346
4347
4348
4349
4350
  	while (memcg) {
  		__mem_cgroup_threshold(memcg, false);
  		if (do_swap_account)
  			__mem_cgroup_threshold(memcg, true);
  
  		memcg = parent_mem_cgroup(memcg);
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4351
4352
4353
4354
4355
4356
  }
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
2bff24a37   Greg Thelen   memcg: fix multip...
4357
4358
4359
4360
4361
4362
4363
  	if (_a->threshold > _b->threshold)
  		return 1;
  
  	if (_a->threshold < _b->threshold)
  		return -1;
  
  	return 0;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4364
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
4365
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4366
4367
  {
  	struct mem_cgroup_eventfd_list *ev;
2bcf2e92c   Michal Hocko   memcg: oom_notify...
4368
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
4369
  	list_for_each_entry(ev, &memcg->oom_notify, list)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4370
  		eventfd_signal(ev->eventfd, 1);
2bcf2e92c   Michal Hocko   memcg: oom_notify...
4371
4372
  
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4373
4374
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
4375
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4376
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
4377
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
4378
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
4379
  		mem_cgroup_oom_notify_cb(iter);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4380
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4381
  static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4382
  	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4383
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4384
4385
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4386
  	u64 threshold, usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4387
  	int i, size, ret;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4388
4389
4390
4391
4392
4393
  
  	ret = res_counter_memparse_write_strategy(args, &threshold);
  	if (ret)
  		return ret;
  
  	mutex_lock(&memcg->thresholds_lock);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4394

05b843012   Johannes Weiner   mm: memcontrol: u...
4395
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4396
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4397
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
4398
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4399
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4400
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
4401
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4402
  		BUG();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4403
  	/* Check if a threshold was crossed before adding a new one */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4404
  	if (thresholds->primary)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4405
  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4406
  	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4407
4408
  
  	/* Allocate memory for new array of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4409
  	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4410
  			GFP_KERNEL);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4411
  	if (!new) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4412
4413
4414
  		ret = -ENOMEM;
  		goto unlock;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4415
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4416
4417
  
  	/* Copy thresholds (if any) to new array */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4418
4419
  	if (thresholds->primary) {
  		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4420
  				sizeof(struct mem_cgroup_threshold));
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4421
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4422
  	/* Add new threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4423
4424
  	new->entries[size - 1].eventfd = eventfd;
  	new->entries[size - 1].threshold = threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4425
4426
  
  	/* Sort thresholds. Registering of new threshold isn't time-critical */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4427
  	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4428
4429
4430
  			compare_thresholds, NULL);
  
  	/* Find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4431
  	new->current_threshold = -1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4432
  	for (i = 0; i < size; i++) {
748dad36d   Sha Zhengju   memcg: make thres...
4433
  		if (new->entries[i].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4434
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4435
4436
  			 * new->current_threshold will not be used until
  			 * rcu_assign_pointer(), so it's safe to increment
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4437
4438
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4439
  			++new->current_threshold;
748dad36d   Sha Zhengju   memcg: make thres...
4440
4441
  		} else
  			break;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4442
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4443
4444
4445
4446
4447
  	/* Free old spare buffer and save old primary buffer as spare */
  	kfree(thresholds->spare);
  	thresholds->spare = thresholds->primary;
  
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4448

907860ed3   Kirill A. Shutemov   cgroups: make cft...
4449
  	/* Make sure that nobody still uses the old thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4450
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4451
4452
4453
4454
4455
  unlock:
  	mutex_unlock(&memcg->thresholds_lock);
  
  	return ret;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4456
  static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4457
4458
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
4459
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
4460
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4461
  static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4462
4463
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
4464
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
4465
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4466
  static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4467
  	struct eventfd_ctx *eventfd, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4468
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4469
4470
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4471
  	u64 usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4472
  	int i, j, size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4473
4474
  
  	mutex_lock(&memcg->thresholds_lock);
05b843012   Johannes Weiner   mm: memcontrol: u...
4475
4476
  
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4477
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4478
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
4479
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4480
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4481
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
4482
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4483
  		BUG();
371528cae   Anton Vorontsov   mm: memcg: Correc...
4484
4485
  	if (!thresholds->primary)
  		goto unlock;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4486
4487
4488
4489
  	/* Check if a threshold was crossed before removing */
  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
  	/* Calculate the new number of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4490
4491
4492
  	size = 0;
  	for (i = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd != eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4493
4494
  			size++;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4495
  	new = thresholds->spare;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
4496

2e72b6347   Kirill A. Shutemov   memcg: implement ...
4497
4498
  	/* Set thresholds array to NULL if we don't have thresholds */
  	if (!size) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4499
4500
  		kfree(new);
  		new = NULL;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
4501
  		goto swap_buffers;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4502
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4503
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4504
4505
  
  	/* Copy thresholds and find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4506
4507
4508
  	new->current_threshold = -1;
  	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd == eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4509
  			continue;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4510
  		new->entries[j] = thresholds->primary->entries[i];
748dad36d   Sha Zhengju   memcg: make thres...
4511
  		if (new->entries[j].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4512
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4513
  			 * new->current_threshold will not be used
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4514
4515
4516
  			 * until rcu_assign_pointer(), so it's safe to increment
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4517
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4518
4519
4520
  		}
  		j++;
  	}
907860ed3   Kirill A. Shutemov   cgroups: make cft...
4521
  swap_buffers:
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4522
4523
  	/* Swap primary and spare array */
  	thresholds->spare = thresholds->primary;
8c7577637   Sha Zhengju   memcg: free spare...
4524
4525
4526
4527
4528
  	/* If all events are unregistered, free the spare array */
  	if (!new) {
  		kfree(thresholds->spare);
  		thresholds->spare = NULL;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
4529
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4530

907860ed3   Kirill A. Shutemov   cgroups: make cft...
4531
  	/* Make sure that nobody still uses the old thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4532
  	synchronize_rcu();
371528cae   Anton Vorontsov   mm: memcg: Correc...
4533
  unlock:
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4534
  	mutex_unlock(&memcg->thresholds_lock);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
4535
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4536

59b6f8734   Tejun Heo   memcg: make cgrou...
4537
  static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4538
4539
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
4540
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
4541
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4542
  static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4543
4544
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
4545
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
4546
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4547
  static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4548
  	struct eventfd_ctx *eventfd, const char *args)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4549
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4550
  	struct mem_cgroup_eventfd_list *event;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4551

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4552
4553
4554
  	event = kmalloc(sizeof(*event),	GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
1af8efe96   Michal Hocko   memcg: change mem...
4555
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4556
4557
4558
4559
4560
  
  	event->eventfd = eventfd;
  	list_add(&event->list, &memcg->oom_notify);
  
  	/* already in OOM ? */
79dfdaccd   Michal Hocko   memcg: make oom_l...
4561
  	if (atomic_read(&memcg->under_oom))
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4562
  		eventfd_signal(eventfd, 1);
1af8efe96   Michal Hocko   memcg: change mem...
4563
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4564
4565
4566
  
  	return 0;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
4567
  static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
4568
  	struct eventfd_ctx *eventfd)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4569
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4570
  	struct mem_cgroup_eventfd_list *ev, *tmp;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4571

1af8efe96   Michal Hocko   memcg: change mem...
4572
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4573

c0ff4b854   Raghavendra K T   memcg: rename mem...
4574
  	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4575
4576
4577
4578
4579
  		if (ev->eventfd == eventfd) {
  			list_del(&ev->list);
  			kfree(ev);
  		}
  	}
1af8efe96   Michal Hocko   memcg: change mem...
4580
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4581
  }
2da8ca822   Tejun Heo   cgroup: replace c...
4582
  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4583
  {
2da8ca822   Tejun Heo   cgroup: replace c...
4584
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4585

791badbdb   Tejun Heo   memcg: convert aw...
4586
4587
4588
4589
  	seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
  	seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4590
4591
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
4592
  static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4593
4594
  	struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
4595
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4596
4597
  
  	/* cannot set to root cgroup and only 0 and 1 are allowed */
14208b0ec   Linus Torvalds   Merge branch 'for...
4598
  	if (!css->parent || !((val == 0) || (val == 1)))
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4599
  		return -EINVAL;
c0ff4b854   Raghavendra K T   memcg: rename mem...
4600
  	memcg->oom_kill_disable = val;
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
4601
  	if (!val)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4602
  		memcg_oom_recover(memcg);
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
4603

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4604
4605
  	return 0;
  }
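  /*
   * Usage illustration (userspace): memory.oom_control accepts only 0 or 1,
   * and not on the root cgroup.  The path is an assumption about the
   * cgroupfs mount point:
   *
   *	// disable the OOM killer for this group
   *	int fd = open("/sys/fs/cgroup/memory/foo/memory.oom_control", O_WRONLY);
   *	write(fd, "1", 1);
   *	close(fd);
   *
   *	// reading the same file then shows, per mem_cgroup_oom_control_read():
   *	//	oom_kill_disable 1
   *	//	under_oom 0
   */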
c255a4580   Andrew Morton   memcg: rename con...
4606
  #ifdef CONFIG_MEMCG_KMEM
cbe128e34   Glauber Costa   cgroup: get rid o...
4607
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
4608
  {
55007d849   Glauber Costa   memcg: allocate m...
4609
  	int ret;
2633d7a02   Glauber Costa   slab/slub: consid...
4610
  	memcg->kmemcg_id = -1;
55007d849   Glauber Costa   memcg: allocate m...
4611
4612
4613
  	ret = memcg_propagate_kmem(memcg);
  	if (ret)
  		return ret;
2633d7a02   Glauber Costa   slab/slub: consid...
4614

1d62e4365   Glauber Costa   cgroup: pass stru...
4615
  	return mem_cgroup_sockets_init(memcg, ss);
573b400d0   Michel Lespinasse   mm/memcontrol.c: ...
4616
  }
e5671dfae   Glauber Costa   Basic kernel memo...
4617

10d5ebf40   Li Zefan   memcg: use css_ge...
4618
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
d1a4c0b37   Glauber Costa   tcp memory pressu...
4619
  {
1d62e4365   Glauber Costa   cgroup: pass stru...
4620
  	mem_cgroup_sockets_destroy(memcg);
10d5ebf40   Li Zefan   memcg: use css_ge...
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
  }
  
  static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
  {
  	if (!memcg_kmem_is_active(memcg))
  		return;
  
  	/*
  	 * kmem charges can outlive the cgroup. In the case of slab
  	 * pages, for instance, a page may contain objects from various
  	 * processes. As we do not take a reference for every such
  	 * allocation, we have to be careful when doing uncharge
  	 * (see memcg_uncharge_kmem) and here during offlining.
  	 *
  	 * The idea is that only the _last_ uncharge which sees
  	 * the dead memcg will drop the last reference. An additional
  	 * reference is taken here before the group is marked dead
  	 * which is then paired with css_put during uncharge resp. here.
  	 *
  	 * Although this might sound strange as this path is called from
ec903c0c8   Tejun Heo   cgroup: rename cs...
4641
4642
4643
4644
  	 * css_offline() when the reference might have dropped down to 0 and
  	 * shouldn't be incremented anymore (css_tryget_online() would
  	 * fail) we do not have other options because of the kmem
  	 * allocations lifetime.
10d5ebf40   Li Zefan   memcg: use css_ge...
4645
4646
  	 */
  	css_get(&memcg->css);
7de37682b   Glauber Costa   memcg: kmem accou...
4647
4648
4649
4650
4651
  
  	memcg_kmem_mark_dead(memcg);
  
  	if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
  		return;
7de37682b   Glauber Costa   memcg: kmem accou...
4652
  	if (memcg_kmem_test_and_clear_dead(memcg))
10d5ebf40   Li Zefan   memcg: use css_ge...
4653
  		css_put(&memcg->css);
d1a4c0b37   Glauber Costa   tcp memory pressu...
4654
  }
e5671dfae   Glauber Costa   Basic kernel memo...
4655
  #else
cbe128e34   Glauber Costa   cgroup: get rid o...
4656
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
4657
4658
4659
  {
  	return 0;
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
4660

10d5ebf40   Li Zefan   memcg: use css_ge...
4661
4662
4663
4664
4665
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
  {
  }
  
  static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
d1a4c0b37   Glauber Costa   tcp memory pressu...
4666
4667
  {
  }
e5671dfae   Glauber Costa   Basic kernel memo...
4668
  #endif
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4669
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
   * DO NOT USE IN NEW FILES.
   *
   * "cgroup.event_control" implementation.
   *
   * This is way over-engineered.  It tries to support fully configurable
   * events for each user.  Such a level of flexibility is completely
   * unnecessary, especially in light of the planned unified hierarchy.
   *
   * Please deprecate this and replace with something simpler if at all
   * possible.
   */
  
  /*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4683
4684
4685
4686
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
4687
  static void memcg_event_remove(struct work_struct *work)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4688
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
4689
4690
  	struct mem_cgroup_event *event =
  		container_of(work, struct mem_cgroup_event, remove);
59b6f8734   Tejun Heo   memcg: make cgrou...
4691
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4692
4693
  
  	remove_wait_queue(event->wqh, &event->wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
4694
  	event->unregister_event(memcg, event->eventfd);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4695
4696
4697
4698
4699
4700
  
  	/* Notify userspace the event is going away. */
  	eventfd_signal(event->eventfd, 1);
  
  	eventfd_ctx_put(event->eventfd);
  	kfree(event);
59b6f8734   Tejun Heo   memcg: make cgrou...
4701
  	css_put(&memcg->css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4702
4703
4704
4705
4706
4707
4708
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
4709
4710
  static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
  			    int sync, void *key)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4711
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
4712
4713
  	struct mem_cgroup_event *event =
  		container_of(wait, struct mem_cgroup_event, wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
4714
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
  	unsigned long flags = (unsigned long)key;
  
  	if (flags & POLLHUP) {
  		/*
  		 * If the event has been detached at cgroup removal, we
  		 * can simply return, knowing the other side will clean up
  		 * for us.
  		 *
  		 * We can't race against event freeing since the other
  		 * side will require wqh->lock via remove_wait_queue(),
  		 * which we hold.
  		 */
fba948078   Tejun Heo   cgroup, memcg: mo...
4727
  		spin_lock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4728
4729
4730
4731
4732
4733
4734
4735
  		if (!list_empty(&event->list)) {
  			list_del_init(&event->list);
  			/*
  			 * We are in atomic context, but memcg_event_remove()
  			 * may sleep, so we have to call it from a workqueue.
  			 */
  			schedule_work(&event->remove);
  		}
fba948078   Tejun Heo   cgroup, memcg: mo...
4736
  		spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4737
4738
4739
4740
  	}
  
  	return 0;
  }
3bc942f37   Tejun Heo   memcg: rename cgr...
4741
  static void memcg_event_ptable_queue_proc(struct file *file,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4742
4743
  		wait_queue_head_t *wqh, poll_table *pt)
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
4744
4745
  	struct mem_cgroup_event *event =
  		container_of(pt, struct mem_cgroup_event, pt);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4746
4747
4748
4749
4750
4751
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
4752
4753
   * DO NOT USE IN NEW FILES.
   *
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4754
4755
4756
4757
4758
   * Parse the input and register a new cgroup event handler.
   *
   * Input must be in the format '<event_fd> <control_fd> <args>'.
   * Interpretation of the args is defined by the control file implementation.
   */
451af504d   Tejun Heo   cgroup: replace c...
4759
4760
  static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
  					 char *buf, size_t nbytes, loff_t off)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4761
  {
451af504d   Tejun Heo   cgroup: replace c...
4762
  	struct cgroup_subsys_state *css = of_css(of);
fba948078   Tejun Heo   cgroup, memcg: mo...
4763
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
4764
  	struct mem_cgroup_event *event;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4765
4766
4767
4768
  	struct cgroup_subsys_state *cfile_css;
  	unsigned int efd, cfd;
  	struct fd efile;
  	struct fd cfile;
fba948078   Tejun Heo   cgroup, memcg: mo...
4769
  	const char *name;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4770
4771
  	char *endp;
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
4772
4773
4774
  	buf = strstrip(buf);
  
  	efd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4775
4776
  	if (*endp != ' ')
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
4777
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4778

451af504d   Tejun Heo   cgroup: replace c...
4779
  	cfd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4780
4781
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
4782
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4783
4784
4785
4786
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
59b6f8734   Tejun Heo   memcg: make cgrou...
4787
  	event->memcg = memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4788
  	INIT_LIST_HEAD(&event->list);
3bc942f37   Tejun Heo   memcg: rename cgr...
4789
4790
4791
  	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
  	INIT_WORK(&event->remove, memcg_event_remove);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
  
  	efile = fdget(efd);
  	if (!efile.file) {
  		ret = -EBADF;
  		goto out_kfree;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile.file);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto out_put_efile;
  	}
  
  	cfile = fdget(cfd);
  	if (!cfile.file) {
  		ret = -EBADF;
  		goto out_put_eventfd;
  	}
  
  	/* the process needs read permission on the control file */
  	/* AV: shouldn't we check that it's been opened for read instead? */
  	ret = inode_permission(file_inode(cfile.file), MAY_READ);
  	if (ret < 0)
  		goto out_put_cfile;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4816
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
4817
4818
4819
4820
  	 * Determine the event callbacks and set them in @event.  This used
  	 * to be done via struct cftype but cgroup core no longer knows
  	 * about these events.  The following is crude but the whole thing
  	 * is for compatibility anyway.
3bc942f37   Tejun Heo   memcg: rename cgr...
4821
4822
  	 *
  	 * DO NOT ADD NEW FILES.
fba948078   Tejun Heo   cgroup, memcg: mo...
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
  	 */
  	name = cfile.file->f_dentry->d_name.name;
  
  	if (!strcmp(name, "memory.usage_in_bytes")) {
  		event->register_event = mem_cgroup_usage_register_event;
  		event->unregister_event = mem_cgroup_usage_unregister_event;
  	} else if (!strcmp(name, "memory.oom_control")) {
  		event->register_event = mem_cgroup_oom_register_event;
  		event->unregister_event = mem_cgroup_oom_unregister_event;
  	} else if (!strcmp(name, "memory.pressure_level")) {
  		event->register_event = vmpressure_register_event;
  		event->unregister_event = vmpressure_unregister_event;
  	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
347c4a874   Tejun Heo   memcg: remove cgr...
4836
4837
  		event->register_event = memsw_cgroup_usage_register_event;
  		event->unregister_event = memsw_cgroup_usage_unregister_event;
fba948078   Tejun Heo   cgroup, memcg: mo...
4838
4839
4840
4841
4842
4843
  	} else {
  		ret = -EINVAL;
  		goto out_put_cfile;
  	}
  
  	/*
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
4844
4845
4846
  	 * Verify that @cfile belongs to @css.  Also, remaining events are
  	 * automatically removed on cgroup destruction but the removal is
  	 * asynchronous, so take an extra ref on @css.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4847
  	 */
ec903c0c8   Tejun Heo   cgroup: rename cs...
4848
4849
  	cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
  					       &memory_cgrp_subsys);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4850
  	ret = -EINVAL;
5a17f543e   Tejun Heo   cgroup: improve c...
4851
  	if (IS_ERR(cfile_css))
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4852
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
4853
4854
  	if (cfile_css != css) {
  		css_put(cfile_css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4855
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
4856
  	}
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4857

451af504d   Tejun Heo   cgroup: replace c...
4858
  	ret = event->register_event(memcg, event->eventfd, buf);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4859
4860
4861
4862
  	if (ret)
  		goto out_put_css;
  
  	efile.file->f_op->poll(efile.file, &event->pt);
fba948078   Tejun Heo   cgroup, memcg: mo...
4863
4864
4865
  	spin_lock(&memcg->event_list_lock);
  	list_add(&event->list, &memcg->event_list);
  	spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4866
4867
4868
  
  	fdput(cfile);
  	fdput(efile);
451af504d   Tejun Heo   cgroup: replace c...
4869
  	return nbytes;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4870
4871
  
  out_put_css:
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
4872
  	css_put(css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
  out_put_cfile:
  	fdput(cfile);
  out_put_eventfd:
  	eventfd_ctx_put(event->eventfd);
  out_put_efile:
  	fdput(efile);
  out_kfree:
  	kfree(event);
  
  	return ret;
  }
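  /*
   * Usage illustration (userspace, error handling omitted): registering a
   * usage threshold through the legacy cgroup.event_control interface
   * parsed above.  The control line is "<event_fd> <control_fd> <args>";
   * the mount point, group name and the 50M threshold are assumptions made
   * for the example.
   *
   *	#include <fcntl.h>
   *	#include <stdint.h>
   *	#include <stdio.h>
   *	#include <string.h>
   *	#include <unistd.h>
   *	#include <sys/eventfd.h>
   *
   *	int main(void)
   *	{
   *		char line[64];
   *		uint64_t ticks;
   *		int efd = eventfd(0, 0);
   *		int ufd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
   *			       O_RDONLY);
   *		int cfd = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
   *			       O_WRONLY);
   *
   *		snprintf(line, sizeof(line), "%d %d %d", efd, ufd, 50 * 1024 * 1024);
   *		write(cfd, line, strlen(line));
   *
   *		// blocks until mem_cgroup_threshold() signals the eventfd
   *		read(efd, &ticks, sizeof(ticks));
   *		return 0;
   *	}
   */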
8cdea7c05   Balbir Singh   Memory controller...
4884
4885
  static struct cftype mem_cgroup_files[] = {
  	{
0eea10301   Balbir Singh   Memory controller...
4886
  		.name = "usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4887
  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4888
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4889
4890
  	},
  	{
c84872e16   Pavel Emelyanov   memcgroup: add th...
4891
  		.name = "max_usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4892
  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4893
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4894
  		.read_u64 = mem_cgroup_read_u64,
c84872e16   Pavel Emelyanov   memcgroup: add th...
4895
4896
  	},
  	{
0eea10301   Balbir Singh   Memory controller...
4897
  		.name = "limit_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4898
  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4899
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4900
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4901
4902
  	},
  	{
296c81d89   Balbir Singh   memory controller...
4903
4904
  		.name = "soft_limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4905
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4906
  		.read_u64 = mem_cgroup_read_u64,
296c81d89   Balbir Singh   memory controller...
4907
4908
  	},
  	{
8cdea7c05   Balbir Singh   Memory controller...
4909
  		.name = "failcnt",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4910
  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4911
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4912
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4913
  	},
8697d3319   Balbir Singh   Memory controller...
4914
  	{
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4915
  		.name = "stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4916
  		.seq_show = memcg_stat_show,
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4917
  	},
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4918
4919
  	{
  		.name = "force_empty",
6770c64e5   Tejun Heo   cgroup: replace c...
4920
  		.write = mem_cgroup_force_empty_write,
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4921
  	},
18f59ea7d   Balbir Singh   memcg: memory cgr...
4922
4923
4924
4925
4926
  	{
  		.name = "use_hierarchy",
  		.write_u64 = mem_cgroup_hierarchy_write,
  		.read_u64 = mem_cgroup_hierarchy_read,
  	},
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4927
  	{
3bc942f37   Tejun Heo   memcg: rename cgr...
4928
  		.name = "cgroup.event_control",		/* XXX: for compat */
451af504d   Tejun Heo   cgroup: replace c...
4929
  		.write = memcg_write_event_control,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4930
4931
4932
4933
  		.flags = CFTYPE_NO_PREFIX,
  		.mode = S_IWUGO,
  	},
  	{
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4934
4935
4936
4937
  		.name = "swappiness",
  		.read_u64 = mem_cgroup_swappiness_read,
  		.write_u64 = mem_cgroup_swappiness_write,
  	},
7dc74be03   Daisuke Nishimura   memcg: add interf...
4938
4939
4940
4941
4942
  	{
  		.name = "move_charge_at_immigrate",
  		.read_u64 = mem_cgroup_move_charge_read,
  		.write_u64 = mem_cgroup_move_charge_write,
  	},
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4943
4944
  	{
  		.name = "oom_control",
2da8ca822   Tejun Heo   cgroup: replace c...
4945
  		.seq_show = mem_cgroup_oom_control_read,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4946
  		.write_u64 = mem_cgroup_oom_control_write,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4947
4948
  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  	},
70ddf637e   Anton Vorontsov   memcg: add memory...
4949
4950
  	{
  		.name = "pressure_level",
70ddf637e   Anton Vorontsov   memcg: add memory...
4951
  	},
406eb0c9b   Ying Han   memcg: add memory...
4952
4953
4954
  #ifdef CONFIG_NUMA
  	{
  		.name = "numa_stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4955
  		.seq_show = memcg_numa_stat_show,
406eb0c9b   Ying Han   memcg: add memory...
4956
4957
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4958
4959
4960
4961
  #ifdef CONFIG_MEMCG_KMEM
  	{
  		.name = "kmem.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4962
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4963
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4964
4965
4966
4967
  	},
  	{
  		.name = "kmem.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4968
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4969
4970
4971
4972
  	},
  	{
  		.name = "kmem.failcnt",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4973
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4974
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4975
4976
4977
4978
  	},
  	{
  		.name = "kmem.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4979
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4980
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4981
  	},
749c54151   Glauber Costa   memcg: aggregate ...
4982
4983
4984
  #ifdef CONFIG_SLABINFO
  	{
  		.name = "kmem.slabinfo",
2da8ca822   Tejun Heo   cgroup: replace c...
4985
  		.seq_show = mem_cgroup_slabinfo_read,
749c54151   Glauber Costa   memcg: aggregate ...
4986
4987
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4988
  #endif
6bc103498   Tejun Heo   cgroup: convert m...
4989
  	{ },	/* terminate */
af36f906c   Tejun Heo   memcg: always cre...
4990
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4991

2d11085e4   Michal Hocko   memcg: do not cre...
4992
4993
4994
4995
4996
  #ifdef CONFIG_MEMCG_SWAP
  static struct cftype memsw_cgroup_files[] = {
  	{
  		.name = "memsw.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4997
  		.read_u64 = mem_cgroup_read_u64,
2d11085e4   Michal Hocko   memcg: do not cre...
4998
4999
5000
5001
  	},
  	{
  		.name = "memsw.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
5002
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
5003
  		.read_u64 = mem_cgroup_read_u64,
2d11085e4   Michal Hocko   memcg: do not cre...
5004
5005
5006
5007
  	},
  	{
  		.name = "memsw.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
5008
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
5009
  		.read_u64 = mem_cgroup_read_u64,
2d11085e4   Michal Hocko   memcg: do not cre...
5010
5011
5012
5013
  	},
  	{
  		.name = "memsw.failcnt",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
5014
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
5015
  		.read_u64 = mem_cgroup_read_u64,
2d11085e4   Michal Hocko   memcg: do not cre...
5016
5017
5018
5019
  	},
  	{ },	/* terminate */
  };
  #endif
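  /*
   * Illustrative sketch, not part of memcontrol.c: the .private values set
   * with MEMFILE_PRIVATE() in the tables above are decoded by the read and
   * write handlers, assuming the usual (type << 16 | attr) packing defined
   * earlier in this file.  The helper below is hypothetical and only shows
   * what such a decode typically looks like.
   */
  #if 0
  static u64 example_read_counter(struct mem_cgroup *memcg, int private)
  {
  	int type = MEMFILE_TYPE(private);	/* _MEM, _MEMSWAP or _KMEM */
  	int attr = MEMFILE_ATTR(private);	/* RES_USAGE, RES_LIMIT, ... */
  	struct res_counter *counter;

  	counter = (type == _MEMSWAP) ? &memcg->memsw :
  		  (type == _KMEM)    ? &memcg->kmem  : &memcg->res;
  	return res_counter_read_u64(counter, attr);
  }
  #endif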
c0ff4b854   Raghavendra K T   memcg: rename mem...
5020
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5021
5022
  {
  	struct mem_cgroup_per_node *pn;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5023
  	struct mem_cgroup_per_zone *mz;
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
5024
  	int zone, tmp = node;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5025
5026
5027
5028
5029
5030
5031
5032
  	/*
  	 * This routine is called for each possible node, but it is a BUG
  	 * to call kmalloc() against an offline node.
  	 *
  	 * TODO: this routine can waste a lot of memory for nodes which will
  	 *       never be onlined.  It would be better to use a memory
  	 *       hotplug callback function instead.
  	 */
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
5033
5034
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
17295c88a   Jesper Juhl   memcg: use [kv]za...
5035
  	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5036
5037
  	if (!pn)
  		return 1;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5038

1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5039
5040
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
bea8c150a   Hugh Dickins   memcg: fix hotplu...
5041
  		lruvec_init(&mz->lruvec);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
5042
5043
  		mz->usage_in_excess = 0;
  		mz->on_tree = false;
d79154bb5   Hugh Dickins   memcg: replace me...
5044
  		mz->memcg = memcg;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5045
  	}
54f72fe02   Johannes Weiner   memcg: clean up m...
5046
  	memcg->nodeinfo[node] = pn;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5047
5048
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
5049
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5050
  {
54f72fe02   Johannes Weiner   memcg: clean up m...
5051
  	kfree(memcg->nodeinfo[node]);
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
5052
  }
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
5053
5054
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
5055
  	struct mem_cgroup *memcg;
8ff69e2c8   Vladimir Davydov   memcg: do not use...
5056
  	size_t size;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
5057

8ff69e2c8   Vladimir Davydov   memcg: do not use...
5058
5059
  	size = sizeof(struct mem_cgroup);
  	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
5060

8ff69e2c8   Vladimir Davydov   memcg: do not use...
5061
  	memcg = kzalloc(size, GFP_KERNEL);
d79154bb5   Hugh Dickins   memcg: replace me...
5062
  	if (!memcg)
e7bbcdf37   Dan Carpenter   memcontrol: fix p...
5063
  		return NULL;
d79154bb5   Hugh Dickins   memcg: replace me...
5064
5065
  	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  	if (!memcg->stat)
d2e61b8dc   Dan Carpenter   memcg: null deref...
5066
  		goto out_free;
d79154bb5   Hugh Dickins   memcg: replace me...
5067
5068
  	spin_lock_init(&memcg->pcp_counter_lock);
  	return memcg;
d2e61b8dc   Dan Carpenter   memcg: null deref...
5069
5070
  
  out_free:
8ff69e2c8   Vladimir Davydov   memcg: do not use...
5071
  	kfree(memcg);
d2e61b8dc   Dan Carpenter   memcg: null deref...
5072
  	return NULL;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
5073
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5074
  /*
c8b2a36fb   Glauber Costa   memcg: execute th...
5075
5076
5077
5078
5079
5080
5081
5082
   * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
   * (Scanning them all at force_empty would be too costly...)
   *
   * Instead of clearing all references at force_empty, we remember
   * the number of references from swap_cgroup and free the mem_cgroup
   * when it drops to 0.
   *
   * Removal of the cgroup itself succeeds regardless of refs from swap.
59927fb98   Hugh Dickins   memcg: free mem_c...
5083
   */
c8b2a36fb   Glauber Costa   memcg: execute th...
5084
5085
  
  static void __mem_cgroup_free(struct mem_cgroup *memcg)
59927fb98   Hugh Dickins   memcg: free mem_c...
5086
  {
c8b2a36fb   Glauber Costa   memcg: execute th...
5087
  	int node;
59927fb98   Hugh Dickins   memcg: free mem_c...
5088

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
5089
  	mem_cgroup_remove_from_trees(memcg);
c8b2a36fb   Glauber Costa   memcg: execute th...
5090
5091
5092
5093
5094
  
  	for_each_node(node)
  		free_mem_cgroup_per_zone_info(memcg, node);
  
  	free_percpu(memcg->stat);
3f1346193   Glauber Costa   memcg: decrement ...
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
  	/*
  	 * We need to make sure that (at least for now) the jump label
  	 * destruction code runs outside of the cgroup lock.  This is because
  	 * get_online_cpus(), which is called from the static_branch update,
  	 * can't be called inside the cgroup_lock.  cpusets are the ones
  	 * enforcing this dependency, so if that ever changes, this
  	 * restriction could be revisited as well.
  	 *
  	 * schedule_work() guarantees this happens.  Be careful if you need
  	 * to move this code around, and make sure it stays outside of
  	 * the cgroup_lock.
  	 */
a8964b9b8   Glauber Costa   memcg: use static...
5106
  	disarm_static_keys(memcg);
8ff69e2c8   Vladimir Davydov   memcg: do not use...
5107
  	kfree(memcg);
59927fb98   Hugh Dickins   memcg: free mem_c...
5108
  }
3afe36b1f   Glauber Costa   memcg: always fre...
5109

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5110
5111
5112
  /*
   * Returns the parent mem_cgroup in the memcg hierarchy (only set up when use_hierarchy is enabled).
   */
e1aab161e   Glauber Costa   socket: initial c...
5113
  struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5114
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
5115
  	if (!memcg->res.parent)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5116
  		return NULL;
c0ff4b854   Raghavendra K T   memcg: rename mem...
5117
  	return mem_cgroup_from_res_counter(memcg->res.parent, res);
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5118
  }
e1aab161e   Glauber Costa   socket: initial c...
5119
  EXPORT_SYMBOL(parent_mem_cgroup);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
5120

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
  static void __init mem_cgroup_soft_limit_tree_init(void)
  {
  	struct mem_cgroup_tree_per_node *rtpn;
  	struct mem_cgroup_tree_per_zone *rtpz;
  	int tmp, node, zone;
  
  	for_each_node(node) {
  		tmp = node;
  		if (!node_state(node, N_NORMAL_MEMORY))
  			tmp = -1;
  		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
  		BUG_ON(!rtpn);
  
  		soft_limit_tree.rb_tree_per_node[node] = rtpn;
  
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			rtpz = &rtpn->rb_tree_per_zone[zone];
  			rtpz->rb_root = RB_ROOT;
  			spin_lock_init(&rtpz->lock);
  		}
  	}
  }
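  /*
   * Illustrative sketch, not part of memcontrol.c: given the layout
   * initialized above, the soft limit tree for a page is reached by
   * indexing first by node and then by zone.  The helper name below is
   * hypothetical; an equivalent lookup helper is defined earlier in this
   * file.
   */
  #if 0
  static struct mem_cgroup_tree_per_zone *
  example_soft_limit_tree(int nid, int zid)
  {
  	struct mem_cgroup_tree_per_node *rtpn;

  	rtpn = soft_limit_tree.rb_tree_per_node[nid];
  	return &rtpn->rb_tree_per_zone[zid];
  }
  #endif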
0eb253e22   Li Zefan   memcg: fix sectio...
5143
  static struct cgroup_subsys_state * __ref
eb95419b0   Tejun Heo   cgroup: pass arou...
5144
  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8cdea7c05   Balbir Singh   Memory controller...
5145
  {
d142e3e66   Glauber Costa   memcg: split part...
5146
  	struct mem_cgroup *memcg;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
5147
  	long error = -ENOMEM;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5148
  	int node;
8cdea7c05   Balbir Singh   Memory controller...
5149

c0ff4b854   Raghavendra K T   memcg: rename mem...
5150
5151
  	memcg = mem_cgroup_alloc();
  	if (!memcg)
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
5152
  		return ERR_PTR(error);
78fb74669   Pavel Emelianov   Memory controller...
5153

3ed28fa10   Bob Liu   memcg: cleanup fo...
5154
  	for_each_node(node)
c0ff4b854   Raghavendra K T   memcg: rename mem...
5155
  		if (alloc_mem_cgroup_per_zone_info(memcg, node))
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5156
  			goto free_out;
f64c3f549   Balbir Singh   memory controller...
5157

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5158
  	/* root ? */
eb95419b0   Tejun Heo   cgroup: pass arou...
5159
  	if (parent_css == NULL) {
a41c58a66   Hillf Danton   memcg: keep root ...
5160
  		root_mem_cgroup = memcg;
d142e3e66   Glauber Costa   memcg: split part...
5161
5162
5163
  		res_counter_init(&memcg->res, NULL);
  		res_counter_init(&memcg->memsw, NULL);
  		res_counter_init(&memcg->kmem, NULL);
18f59ea7d   Balbir Singh   memcg: memory cgr...
5164
  	}
28dbc4b6a   Balbir Singh   memcg: memory cgr...
5165

d142e3e66   Glauber Costa   memcg: split part...
5166
5167
  	memcg->last_scanned_node = MAX_NUMNODES;
  	INIT_LIST_HEAD(&memcg->oom_notify);
d142e3e66   Glauber Costa   memcg: split part...
5168
5169
5170
  	memcg->move_charge_at_immigrate = 0;
  	mutex_init(&memcg->thresholds_lock);
  	spin_lock_init(&memcg->move_lock);
70ddf637e   Anton Vorontsov   memcg: add memory...
5171
  	vmpressure_init(&memcg->vmpressure);
fba948078   Tejun Heo   cgroup, memcg: mo...
5172
5173
  	INIT_LIST_HEAD(&memcg->event_list);
  	spin_lock_init(&memcg->event_list_lock);
d142e3e66   Glauber Costa   memcg: split part...
5174
5175
5176
5177
5178
5179
5180
5181
5182
  
  	return &memcg->css;
  
  free_out:
  	__mem_cgroup_free(memcg);
  	return ERR_PTR(error);
  }
  
  static int
eb95419b0   Tejun Heo   cgroup: pass arou...
5183
  mem_cgroup_css_online(struct cgroup_subsys_state *css)
d142e3e66   Glauber Costa   memcg: split part...
5184
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
5185
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
5186
  	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
5187
  	int ret;
d142e3e66   Glauber Costa   memcg: split part...
5188

15a4c835e   Tejun Heo   cgroup, memcg: im...
5189
  	if (css->id > MEM_CGROUP_ID_MAX)
4219b2da2   Li Zefan   memcg: fail to cr...
5190
  		return -ENOSPC;
638769869   Tejun Heo   cgroup: add css_p...
5191
  	if (!parent)
d142e3e66   Glauber Costa   memcg: split part...
5192
  		return 0;
0999821b1   Glauber Costa   memcg: replace cg...
5193
  	mutex_lock(&memcg_create_mutex);
d142e3e66   Glauber Costa   memcg: split part...
5194
5195
5196
5197
5198
5199
  
  	memcg->use_hierarchy = parent->use_hierarchy;
  	memcg->oom_kill_disable = parent->oom_kill_disable;
  	memcg->swappiness = mem_cgroup_swappiness(parent);
  
  	if (parent->use_hierarchy) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
5200
5201
  		res_counter_init(&memcg->res, &parent->res);
  		res_counter_init(&memcg->memsw, &parent->memsw);
510fc4e11   Glauber Costa   memcg: kmem accou...
5202
  		res_counter_init(&memcg->kmem, &parent->kmem);
55007d849   Glauber Costa   memcg: allocate m...
5203

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5204
  		/*
8d76a9797   Li Zefan   memcg: don't need...
5205
5206
  		 * No need to take a reference to the parent because cgroup
  		 * core guarantees its existence.
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
5207
  		 */
18f59ea7d   Balbir Singh   memcg: memory cgr...
5208
  	} else {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5209
5210
5211
  		res_counter_init(&memcg->res, NULL);
  		res_counter_init(&memcg->memsw, NULL);
  		res_counter_init(&memcg->kmem, NULL);
8c7f6edbd   Tejun Heo   cgroup: mark subs...
5212
5213
5214
5215
5216
  		/*
  		 * A deeper hierarchy with use_hierarchy == false doesn't make
  		 * much sense, so let the cgroup subsystem know about this
  		 * unfortunate state in our controller.
  		 */
d142e3e66   Glauber Costa   memcg: split part...
5217
  		if (parent != root_mem_cgroup)
073219e99   Tejun Heo   cgroup: clean up ...
5218
  			memory_cgrp_subsys.broken_hierarchy = true;
18f59ea7d   Balbir Singh   memcg: memory cgr...
5219
  	}
0999821b1   Glauber Costa   memcg: replace cg...
5220
  	mutex_unlock(&memcg_create_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
5221

2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
  	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
  	if (ret)
  		return ret;
  
  	/*
  	 * Make sure the memcg is initialized: mem_cgroup_iter()
  	 * orders reading memcg->initialized against its callers
  	 * reading the memcg members.
  	 */
  	smp_store_release(&memcg->initialized, 1);
  
  	return 0;
8cdea7c05   Balbir Singh   Memory controller...
5234
  }
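  /*
   * Illustrative note, not part of memcontrol.c: with use_hierarchy enabled
   * above, res_counter_init(&memcg->res, &parent->res) chains the child
   * counter to its parent, so a charge of N pages in the child is accounted
   * against (and limited by) every ancestor counter as well.  With
   * use_hierarchy disabled the counters are initialized with a NULL parent
   * and charges stay local to the group.
   */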
5f5781619   Michal Hocko   memcg: relax memc...
5235
5236
5237
5238
5239
5240
5241
5242
  /*
   * Announce to all parents that a group from their hierarchy is gone.
   */
  static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
  {
  	struct mem_cgroup *parent = memcg;
  
  	while ((parent = parent_mem_cgroup(parent)))
519ebea3b   Johannes Weiner   mm: memcontrol: f...
5243
  		mem_cgroup_iter_invalidate(parent);
5f5781619   Michal Hocko   memcg: relax memc...
5244
5245
5246
5247
5248
5249
  
  	/*
  	 * If the root memcg is not hierarchical we have to check it
  	 * explicitly.
  	 */
  	if (!root_mem_cgroup->use_hierarchy)
519ebea3b   Johannes Weiner   mm: memcontrol: f...
5250
  		mem_cgroup_iter_invalidate(root_mem_cgroup);
5f5781619   Michal Hocko   memcg: relax memc...
5251
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5252
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
5253
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
5254
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
5255
  	struct mem_cgroup_event *event, *tmp;
4fb1a86fb   Filipe Brandenburger   memcg: reparent c...
5256
  	struct cgroup_subsys_state *iter;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
5257
5258
5259
5260
5261
5262
  
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removal only after rmdir of the cgroup
  	 * directory, to avoid a race between userspace and kernelspace.
  	 */
fba948078   Tejun Heo   cgroup, memcg: mo...
5263
5264
  	spin_lock(&memcg->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
5265
5266
5267
  		list_del_init(&event->list);
  		schedule_work(&event->remove);
  	}
fba948078   Tejun Heo   cgroup, memcg: mo...
5268
  	spin_unlock(&memcg->event_list_lock);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
5269

10d5ebf40   Li Zefan   memcg: use css_ge...
5270
  	kmem_cgroup_css_offline(memcg);
5f5781619   Michal Hocko   memcg: relax memc...
5271
  	mem_cgroup_invalidate_reclaim_iterators(memcg);
4fb1a86fb   Filipe Brandenburger   memcg: reparent c...
5272
5273
5274
5275
5276
5277
5278
  
  	/*
  	 * This requires that offlining is serialized.  Right now that is
  	 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
  	 */
  	css_for_each_descendant_post(iter, css)
  		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
776ed0f03   Vladimir Davydov   memcg: cleanup km...
5279
  	memcg_unregister_all_caches(memcg);
33cb876e9   Michal Hocko   vmpressure: make ...
5280
  	vmpressure_cleanup(&memcg->vmpressure);
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
5281
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5282
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
8cdea7c05   Balbir Singh   Memory controller...
5283
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
5284
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
96f1c58d8   Johannes Weiner   mm: memcg: fix ra...
5285
5286
5287
  	/*
  	 * XXX: css_offline() would be where we should reparent all
  	 * memory to prepare the cgroup for destruction.  However,
ec903c0c8   Tejun Heo   cgroup: rename cs...
5288
  	 * memcg does not do css_tryget_online() and res_counter charging
96f1c58d8   Johannes Weiner   mm: memcg: fix ra...
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
  	 * under the same RCU lock region, which means that charging
  	 * could race with offlining.  Offlining only happens to
  	 * cgroups with no tasks in them but charges can show up
  	 * without any tasks from the swapin path when the target
  	 * memcg is looked up from the swapout record and not from the
  	 * current task as it usually is.  A race like this can leak
  	 * charges and put pages with stale cgroup pointers into
  	 * circulation:
  	 *
  	 * #0                        #1
  	 *                           lookup_swap_cgroup_id()
  	 *                           rcu_read_lock()
  	 *                           mem_cgroup_lookup()
ec903c0c8   Tejun Heo   cgroup: rename cs...
5302
  	 *                           css_tryget_online()
96f1c58d8   Johannes Weiner   mm: memcg: fix ra...
5303
  	 *                           rcu_read_unlock()
ec903c0c8   Tejun Heo   cgroup: rename cs...
5304
  	 * disable css_tryget_online()
96f1c58d8   Johannes Weiner   mm: memcg: fix ra...
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
  	 * call_rcu()
  	 *   offline_css()
  	 *     reparent_charges()
  	 *                           res_counter_charge()
  	 *                           css_put()
  	 *                             css_free()
  	 *                           pc->mem_cgroup = dead memcg
  	 *                           add page to lru
  	 *
  	 * The bulk of the charges are still moved in offline_css() to
  	 * avoid pinning a lot of pages in case a long-term reference
  	 * like a swapout record is deferring the css_free() to long
  	 * after offlining.  But this makes sure we catch any charges
  	 * made after offlining:
  	 */
  	mem_cgroup_reparent_charges(memcg);
c268e9946   Daisuke Nishimura   memcg: fix hierar...
5321

10d5ebf40   Li Zefan   memcg: use css_ge...
5322
  	memcg_destroy_kmem(memcg);
465939a1f   Li Zefan   memcg: don't need...
5323
  	__mem_cgroup_free(memcg);
8cdea7c05   Balbir Singh   Memory controller...
5324
  }
1ced953b1   Tejun Heo   blkcg, memcg: mak...
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
  /**
   * mem_cgroup_css_reset - reset the states of a mem_cgroup
   * @css: the target css
   *
   * Reset the states of the mem_cgroup associated with @css.  This is
   * invoked when userland requests disabling the controller on the default
   * hierarchy but the memcg is pinned through a dependency.  The memcg should stop
   * applying policies and should revert to the vanilla state as it may be
   * made visible again.
   *
   * The current implementation only resets the essential configurations.
   * This needs to be expanded to cover all the visible parts.
   */
  static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
  	mem_cgroup_resize_limit(memcg, ULLONG_MAX);
  	mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
  	memcg_update_kmem_limit(memcg, ULLONG_MAX);
  	res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
  }
024914477   Daisuke Nishimura   memcg: move charg...
5347
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
5348
  /* Handlers for move charge at task migration. */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5349
  static int mem_cgroup_do_precharge(unsigned long count)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5350
  {
05b843012   Johannes Weiner   mm: memcontrol: u...
5351
  	int ret;
9476db974   Johannes Weiner   mm: memcontrol: s...
5352
5353
  
  	/* Try a single bulk charge without reclaim first */
00501b531   Johannes Weiner   mm: memcontrol: r...
5354
  	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
9476db974   Johannes Weiner   mm: memcontrol: s...
5355
  	if (!ret) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5356
  		mc.precharge += count;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5357
5358
  		return ret;
  	}
692e7c45d   Johannes Weiner   mm: memcontrol: c...
5359
  	if (ret == -EINTR) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5360
  		cancel_charge(root_mem_cgroup, count);
692e7c45d   Johannes Weiner   mm: memcontrol: c...
5361
5362
  		return ret;
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
5363
5364
  
  	/* Try charges one by one with reclaim */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5365
  	while (count--) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5366
  		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
9476db974   Johannes Weiner   mm: memcontrol: s...
5367
5368
5369
  		/*
  		 * In case of failure, any residual charges against
  		 * mc.to will be dropped by mem_cgroup_clear_mc()
692e7c45d   Johannes Weiner   mm: memcontrol: c...
5370
5371
  		 * later on.  However, cancel any charges that are
  		 * bypassed to root right away or they'll be lost.
9476db974   Johannes Weiner   mm: memcontrol: s...
5372
  		 */
692e7c45d   Johannes Weiner   mm: memcontrol: c...
5373
  		if (ret == -EINTR)
00501b531   Johannes Weiner   mm: memcontrol: r...
5374
  			cancel_charge(root_mem_cgroup, 1);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
5375
  		if (ret)
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
5376
  			return ret;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5377
  		mc.precharge++;
9476db974   Johannes Weiner   mm: memcontrol: s...
5378
  		cond_resched();
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5379
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
5380
  	return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5381
5382
5383
  }
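  /*
   * Illustrative walkthrough, not part of memcontrol.c: for a precharge of,
   * say, 64 pages, the function above first attempts one bulk
   * try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, 64), i.e. without reclaim.
   * Only if that fails, and was not bypassed to root (-EINTR), does it fall
   * back to 64 individual try_charge() calls that may reclaim, giving up on
   * the first hard failure.
   */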
  
  /**
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5384
   * get_mctgt_type - get target type of moving charge
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5385
5386
5387
   * @vma: the vma to which the pte to be checked belongs
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
5388
   * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5389
5390
5391
5392
5393
5394
   *
   * Returns
   *   0 (MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target
   *     for move charge.  If @target is not NULL, the page is stored in
   *     target->page with an extra refcount taken (callers must handle it).
024914477   Daisuke Nishimura   memcg: move charg...
5395
5396
5397
   *   2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration.  If @target is not NULL, the entry is
   *     stored in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5398
5399
5400
   *
   * Called with pte lock held.
   */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5401
5402
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
5403
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5404
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5405
  enum mc_target_type {
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5406
  	MC_TARGET_NONE = 0,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5407
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
5408
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5409
  };
90254a658   Daisuke Nishimura   memcg: clean up m...
5410
5411
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5412
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
5413
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5414

90254a658   Daisuke Nishimura   memcg: clean up m...
5415
5416
5417
5418
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
  		/* we don't move shared anon */
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
5419
  		if (!move_anon())
90254a658   Daisuke Nishimura   memcg: clean up m...
5420
  			return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
5421
5422
  	} else if (!move_file())
  		/* we ignore mapcount for file pages */
90254a658   Daisuke Nishimura   memcg: clean up m...
5423
5424
5425
5426
5427
5428
  		return NULL;
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
5429
  #ifdef CONFIG_SWAP
90254a658   Daisuke Nishimura   memcg: clean up m...
5430
5431
5432
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
5433
5434
5435
5436
5437
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
  
  	if (!move_anon() || non_swap_entry(ent))
  		return NULL;
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
5438
5439
5440
5441
  	/*
  	 * Because lookup_swap_cache() updates some statistics counters,
  	 * we call find_get_page() with swapper_space directly.
  	 */
33806f06d   Shaohua Li   swap: make each s...
5442
  	page = find_get_page(swap_address_space(ent), ent.val);
90254a658   Daisuke Nishimura   memcg: clean up m...
5443
5444
5445
5446
5447
  	if (do_swap_account)
  		entry->val = ent.val;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
5448
5449
5450
5451
5452
5453
5454
  #else
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	return NULL;
  }
  #endif
90254a658   Daisuke Nishimura   memcg: clean up m...
5455

87946a722   Daisuke Nishimura   memcg: move charg...
5456
5457
5458
5459
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
5460
5461
5462
5463
5464
5465
5466
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
  	if (!move_file())
  		return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
5467
5468
5469
5470
5471
5472
5473
  	mapping = vma->vm_file->f_mapping;
  	if (pte_none(ptent))
  		pgoff = linear_page_index(vma, addr);
  	else /* pte_file(ptent) is true */
  		pgoff = pte_to_pgoff(ptent);
  
  	/* page is moved even if it's not RSS of this task (page-faulted). */
aa3b18955   Hugh Dickins   tmpfs: convert me...
5474
5475
  #ifdef CONFIG_SWAP
  	/* shmem/tmpfs may report page out on swap: account for that too. */
139b6a6fb   Johannes Weiner   mm: filemap: upda...
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
  	if (shmem_mapping(mapping)) {
  		page = find_get_entry(mapping, pgoff);
  		if (radix_tree_exceptional_entry(page)) {
  			swp_entry_t swp = radix_to_swp_entry(page);
  			if (do_swap_account)
  				*entry = swp;
  			page = find_get_page(swap_address_space(swp), swp.val);
  		}
  	} else
  		page = find_get_page(mapping, pgoff);
  #else
  	page = find_get_page(mapping, pgoff);
aa3b18955   Hugh Dickins   tmpfs: convert me...
5488
  #endif
87946a722   Daisuke Nishimura   memcg: move charg...
5489
5490
  	return page;
  }
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5491
  static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
90254a658   Daisuke Nishimura   memcg: clean up m...
5492
5493
5494
5495
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
  	struct page_cgroup *pc;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5496
  	enum mc_target_type ret = MC_TARGET_NONE;
90254a658   Daisuke Nishimura   memcg: clean up m...
5497
5498
5499
5500
5501
5502
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
87946a722   Daisuke Nishimura   memcg: move charg...
5503
5504
  	else if (pte_none(ptent) || pte_file(ptent))
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
5505
5506
  
  	if (!page && !ent.val)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5507
  		return ret;
024914477   Daisuke Nishimura   memcg: move charg...
5508
5509
5510
  	if (page) {
  		pc = lookup_page_cgroup(page);
  		/*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5511
5512
5513
  		 * Do only a loose check without serialization.
  		 * mem_cgroup_move_account() checks whether the pc is
  		 * valid under LRU exclusion.
024914477   Daisuke Nishimura   memcg: move charg...
5514
5515
5516
5517
5518
5519
5520
5521
5522
  		 */
  		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
5523
5524
  	/* There is a swap entry and the page doesn't exist or isn't charged */
  	if (ent.val && !ret &&
34c00c319   Li Zefan   memcg: convert to...
5525
  	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
5526
5527
5528
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5529
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5530
5531
  	return ret;
  }
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * We don't consider swapping or file-mapped pages because THP does not
   * support them for now.
   * The caller must make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	struct page *page = NULL;
  	struct page_cgroup *pc;
  	enum mc_target_type ret = MC_TARGET_NONE;
  
  	page = pmd_page(pmd);
309381fea   Sasha Levin   mm: dump page whe...
5546
  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
  	if (!move_anon())
  		return ret;
  	pc = lookup_page_cgroup(page);
  	if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
  		ret = MC_TARGET_PAGE;
  		if (target) {
  			get_page(page);
  			target->page = page;
  		}
  	}
  	return ret;
  }
  #else
  static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	return MC_TARGET_NONE;
  }
  #endif
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5566
5567
5568
5569
5570
5571
5572
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
5573
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5574
5575
  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
  			mc.precharge += HPAGE_PMD_NR;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
5576
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
5577
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5578
  	}
033193275   Dave Hansen   pagewalk: only sp...
5579

45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
5580
5581
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5582
5583
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5584
  		if (get_mctgt_type(vma, addr, *pte, NULL))
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5585
5586
5587
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
5588
5589
  	return 0;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5590
5591
5592
5593
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
  	struct vm_area_struct *vma;
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5594
  	down_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5595
5596
5597
5598
5599
5600
5601
5602
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		struct mm_walk mem_cgroup_count_precharge_walk = {
  			.pmd_entry = mem_cgroup_count_precharge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5603
5604
5605
  		walk_page_range(vma->vm_start, vma->vm_end,
  					&mem_cgroup_count_precharge_walk);
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5606
  	up_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5607
5608
5609
5610
5611
5612
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5613
5614
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5615
5616
5617
5618
5619
  	unsigned long precharge = mem_cgroup_count_precharge(mm);
  
  	VM_BUG_ON(mc.moving_task);
  	mc.moving_task = current;
  	return mem_cgroup_do_precharge(precharge);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5620
  }
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5621
5622
  /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
  static void __mem_cgroup_clear_mc(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5623
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5624
5625
  	struct mem_cgroup *from = mc.from;
  	struct mem_cgroup *to = mc.to;
4050377b5   Li Zefan   memcg: use css_ge...
5626
  	int i;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5627

4ffef5fef   Daisuke Nishimura   memcg: move charg...
5628
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5629
  	if (mc.precharge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5630
  		cancel_charge(mc.to, mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5631
5632
5633
5634
5635
5636
5637
  		mc.precharge = 0;
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5638
  		cancel_charge(mc.from, mc.moved_charge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5639
  		mc.moved_charge = 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5640
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
5641
5642
  	/* we must fixup refcnts and charges */
  	if (mc.moved_swap) {
483c30b51   Daisuke Nishimura   memcg: improve pe...
5643
  		/* uncharge swap account from the old cgroup */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5644
5645
5646
  		if (!mem_cgroup_is_root(mc.from))
  			res_counter_uncharge(&mc.from->memsw,
  					     PAGE_SIZE * mc.moved_swap);
4050377b5   Li Zefan   memcg: use css_ge...
5647
5648
5649
  
  		for (i = 0; i < mc.moved_swap; i++)
  			css_put(&mc.from->css);
483c30b51   Daisuke Nishimura   memcg: improve pe...
5650

05b843012   Johannes Weiner   mm: memcontrol: u...
5651
5652
5653
5654
  		/*
  		 * we charged both to->res and to->memsw, so we should
  		 * uncharge to->res.
  		 */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5655
5656
5657
  		if (!mem_cgroup_is_root(mc.to))
  			res_counter_uncharge(&mc.to->res,
  					     PAGE_SIZE * mc.moved_swap);
4050377b5   Li Zefan   memcg: use css_ge...
5658
  		/* we've already done css_get(mc.to) */
483c30b51   Daisuke Nishimura   memcg: improve pe...
5659
5660
  		mc.moved_swap = 0;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
  	memcg_oom_recover(from);
  	memcg_oom_recover(to);
  	wake_up_all(&mc.waitq);
  }
  
  static void mem_cgroup_clear_mc(void)
  {
  	struct mem_cgroup *from = mc.from;
  
  	/*
  	 * we must clear moving_task before waking up waiters at the end of
  	 * task migration.
  	 */
  	mc.moving_task = NULL;
  	__mem_cgroup_clear_mc();
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5676
  	spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5677
5678
  	mc.from = NULL;
  	mc.to = NULL;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5679
  	spin_unlock(&mc.lock);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
5680
  	mem_cgroup_end_move(from);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5681
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5682
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5683
  				 struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5684
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
5685
  	struct task_struct *p = cgroup_taskset_first(tset);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5686
  	int ret = 0;
eb95419b0   Tejun Heo   cgroup: pass arou...
5687
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
ee5e8472b   Glauber Costa   memcg: prevent ch...
5688
  	unsigned long move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
5689

ee5e8472b   Glauber Costa   memcg: prevent ch...
5690
5691
5692
5693
5694
5695
5696
  	/*
  	 * We are now committed to this value whatever it is.  Changes in this
  	 * tunable will only affect upcoming migrations, not the current one.
  	 * So we need to save it, and keep it going.
  	 */
  	move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
  	if (move_charge_at_immigrate) {
7dc74be03   Daisuke Nishimura   memcg: add interf...
5697
5698
  		struct mm_struct *mm;
  		struct mem_cgroup *from = mem_cgroup_from_task(p);
c0ff4b854   Raghavendra K T   memcg: rename mem...
5699
  		VM_BUG_ON(from == memcg);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5700
5701
5702
5703
  
  		mm = get_task_mm(p);
  		if (!mm)
  			return 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
5704
  		/* We move charges only when we move the owner of the mm */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5705
5706
5707
5708
  		if (mm->owner == p) {
  			VM_BUG_ON(mc.from);
  			VM_BUG_ON(mc.to);
  			VM_BUG_ON(mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5709
  			VM_BUG_ON(mc.moved_charge);
483c30b51   Daisuke Nishimura   memcg: improve pe...
5710
  			VM_BUG_ON(mc.moved_swap);
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
5711
  			mem_cgroup_start_move(from);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5712
  			spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5713
  			mc.from = from;
c0ff4b854   Raghavendra K T   memcg: rename mem...
5714
  			mc.to = memcg;
ee5e8472b   Glauber Costa   memcg: prevent ch...
5715
  			mc.immigrate_flags = move_charge_at_immigrate;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
5716
  			spin_unlock(&mc.lock);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5717
  			/* We set mc.moving_task later */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5718
5719
5720
5721
  
  			ret = mem_cgroup_precharge_mc(mm);
  			if (ret)
  				mem_cgroup_clear_mc();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5722
5723
  		}
  		mmput(mm);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5724
5725
5726
  	}
  	return ret;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5727
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5728
  				     struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5729
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5730
  	mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
5731
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5732
5733
5734
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
5735
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5736
5737
5738
5739
  	int ret = 0;
  	struct vm_area_struct *vma = walk->private;
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5740
5741
5742
5743
  	enum mc_target_type target_type;
  	union mc_target target;
  	struct page *page;
  	struct page_cgroup *pc;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5744

12724850e   Naoya Horiguchi   memcg: avoid THP ...
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
  	/*
  	 * We don't take compound_lock() here, but there is no race with
  	 * thp splitting because:
  	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
  	 *    under splitting, which means there's no concurrent thp split,
  	 *  - if another thread runs into split_huge_page() just after we
  	 *    entered this if-block, the thread must wait for the page table
  	 *    lock to be unlocked in __split_huge_page_splitting(), where the
  	 *    main part of the thp split is not executed yet.
  	 */
bf929152e   Kirill A. Shutemov   mm, thp: change p...
5755
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
62ade86ab   Hugh Dickins   memcg,thp: fix re...
5756
  		if (mc.precharge < HPAGE_PMD_NR) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
5757
  			spin_unlock(ptl);
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5758
5759
5760
5761
5762
5763
5764
5765
  			return 0;
  		}
  		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
  		if (target_type == MC_TARGET_PAGE) {
  			page = target.page;
  			if (!isolate_lru_page(page)) {
  				pc = lookup_page_cgroup(page);
  				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
5766
  							pc, mc.from, mc.to)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5767
5768
5769
5770
5771
5772
5773
  					mc.precharge -= HPAGE_PMD_NR;
  					mc.moved_charge += HPAGE_PMD_NR;
  				}
  				putback_lru_page(page);
  			}
  			put_page(page);
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
5774
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
5775
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
5776
  	}
45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
5777
5778
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5779
5780
5781
5782
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
024914477   Daisuke Nishimura   memcg: move charg...
5783
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5784
5785
5786
  
  		if (!mc.precharge)
  			break;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5787
  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5788
5789
5790
5791
5792
  		case MC_TARGET_PAGE:
  			page = target.page;
  			if (isolate_lru_page(page))
  				goto put;
  			pc = lookup_page_cgroup(page);
7ec99d621   Johannes Weiner   memcg: unify char...
5793
  			if (!mem_cgroup_move_account(page, 1, pc,
2f3479b14   KAMEZAWA Hiroyuki   memcg: don't unch...
5794
  						     mc.from, mc.to)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5795
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5796
5797
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5798
5799
  			}
  			putback_lru_page(page);
8d32ff844   Naoya Horiguchi   memcg: clean up e...
5800
  put:			/* get_mctgt_type() gets the page */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5801
5802
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
5803
5804
  		case MC_TARGET_SWAP:
  			ent = target.ent;
e91cbb425   Hugh Dickins   memcg swap: mem_c...
5805
  			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
024914477   Daisuke Nishimura   memcg: move charg...
5806
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
5807
5808
5809
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
5810
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all precharges we got in can_attach().
  		 * We try to charge one page at a time, but we don't do any
  		 * additional charges to mc.to if we have already failed to
  		 * charge once in the attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5825
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
  
  static void mem_cgroup_move_charge(struct mm_struct *mm)
  {
  	struct vm_area_struct *vma;
  
  	lru_add_drain_all();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
  retry:
  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  		/*
  		 * Someone holding the mmap_sem might be waiting on the
  		 * waitq.  So we cancel all extra charges, wake up all waiters,
  		 * and retry.  Because we cancel precharges, we might not be
  		 * able to move enough charges, but moving charge is a
  		 * best-effort feature anyway, so it wouldn't be a big problem.
  		 */
  		__mem_cgroup_clear_mc();
  		cond_resched();
  		goto retry;
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5851
5852
5853
5854
5855
5856
5857
5858
5859
  	for (vma = mm->mmap; vma; vma = vma->vm_next) {
  		int ret;
  		struct mm_walk mem_cgroup_move_charge_walk = {
  			.pmd_entry = mem_cgroup_move_charge_pte_range,
  			.mm = mm,
  			.private = vma,
  		};
  		if (is_vm_hugetlb_page(vma))
  			continue;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5860
5861
5862
5863
5864
5865
5866
5867
5868
  		ret = walk_page_range(vma->vm_start, vma->vm_end,
  						&mem_cgroup_move_charge_walk);
  		if (ret)
  			/*
  			 * This means we have consumed all precharges and failed
  			 * to do an additional charge.  Just abandon here.
  			 */
  			break;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5869
  	up_read(&mm->mmap_sem);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5870
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5871
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5872
  				 struct cgroup_taskset *tset)
67e465a77   Balbir Singh   Memory controller...
5873
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
5874
  	struct task_struct *p = cgroup_taskset_first(tset);
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5875
  	struct mm_struct *mm = get_task_mm(p);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5876

dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5877
  	if (mm) {
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5878
5879
  		if (mc.to)
  			mem_cgroup_move_charge(mm);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5880
5881
  		mmput(mm);
  	}
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5882
5883
  	if (mc.to)
  		mem_cgroup_clear_mc();
67e465a77   Balbir Singh   Memory controller...
5884
  }
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5885
  #else	/* !CONFIG_MMU */
eb95419b0   Tejun Heo   cgroup: pass arou...
5886
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5887
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5888
5889
5890
  {
  	return 0;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5891
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5892
  				     struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5893
5894
  {
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5895
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5896
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5897
5898
5899
  {
  }
  #endif
67e465a77   Balbir Singh   Memory controller...
5900

f00baae7a   Tejun Heo   memcg: force use_...
5901
5902
  /*
   * Cgroup retains root cgroups across [un]mount cycles, making it necessary
aa6ec29be   Tejun Heo   cgroup: remove sa...
5903
5904
   * to verify whether we're attached to the default hierarchy on each mount
   * attempt.
f00baae7a   Tejun Heo   memcg: force use_...
5905
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
5906
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
f00baae7a   Tejun Heo   memcg: force use_...
5907
5908
  {
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
5909
  	 * use_hierarchy is forced on the default hierarchy.  cgroup core
f00baae7a   Tejun Heo   memcg: force use_...
5910
5911
5912
  	 * guarantees that @root doesn't have any children, so turning it
  	 * on for the root memcg is enough.
  	 */
aa6ec29be   Tejun Heo   cgroup: remove sa...
5913
  	if (cgroup_on_dfl(root_css->cgroup))
eb95419b0   Tejun Heo   cgroup: pass arou...
5914
  		mem_cgroup_from_css(root_css)->use_hierarchy = true;
f00baae7a   Tejun Heo   memcg: force use_...
5915
  }
073219e99   Tejun Heo   cgroup: clean up ...
5916
  struct cgroup_subsys memory_cgrp_subsys = {
92fb97487   Tejun Heo   cgroup: rename ->...
5917
  	.css_alloc = mem_cgroup_css_alloc,
d142e3e66   Glauber Costa   memcg: split part...
5918
  	.css_online = mem_cgroup_css_online,
92fb97487   Tejun Heo   cgroup: rename ->...
5919
5920
  	.css_offline = mem_cgroup_css_offline,
  	.css_free = mem_cgroup_css_free,
1ced953b1   Tejun Heo   blkcg, memcg: mak...
5921
  	.css_reset = mem_cgroup_css_reset,
7dc74be03   Daisuke Nishimura   memcg: add interf...
5922
5923
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
67e465a77   Balbir Singh   Memory controller...
5924
  	.attach = mem_cgroup_move_task,
f00baae7a   Tejun Heo   memcg: force use_...
5925
  	.bind = mem_cgroup_bind,
5577964e6   Tejun Heo   cgroup: rename cg...
5926
  	.legacy_cftypes = mem_cgroup_files,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5927
  	.early_init = 0,
8cdea7c05   Balbir Singh   Memory controller...
5928
  };
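  /*
   * Illustrative usage, not part of memcontrol.c: the callbacks and
   * legacy_cftypes registered above surface as files under a memory
   * controller mount (example paths, depending on how it is mounted):
   *
   *	# mkdir /sys/fs/cgroup/memory/grp
   *	# echo 3  > /sys/fs/cgroup/memory/grp/memory.move_charge_at_immigrate
   *	# echo $$ > /sys/fs/cgroup/memory/grp/tasks
   *
   * Moving a task in with move_charge_at_immigrate set exercises the
   * can_attach/attach/cancel_attach callbacks wired up above.
   */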
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5929

c255a4580   Andrew Morton   memcg: rename con...
5930
  #ifdef CONFIG_MEMCG_SWAP
a42c390cf   Michal Hocko   cgroups: make swa...
5931
5932
  static int __init enable_swap_account(char *s)
  {
a2c8990ae   Michal Hocko   memsw: remove nos...
5933
  	if (!strcmp(s, "1"))
a42c390cf   Michal Hocko   cgroups: make swa...
5934
  		really_do_swap_account = 1;
a2c8990ae   Michal Hocko   memsw: remove nos...
5935
  	else if (!strcmp(s, "0"))
a42c390cf   Michal Hocko   cgroups: make swa...
5936
5937
5938
  		really_do_swap_account = 0;
  	return 1;
  }
a2c8990ae   Michal Hocko   memsw: remove nos...
5939
  __setup("swapaccount=", enable_swap_account);
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5940

2d11085e4   Michal Hocko   memcg: do not cre...
5941
5942
  static void __init memsw_file_init(void)
  {
2cf669a58   Tejun Heo   cgroup: replace c...
5943
5944
  	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
  					  memsw_cgroup_files));
6acc8b025   Michal Hocko   memcg: clean up s...
5945
5946
5947
5948
5949
5950
5951
5952
  }
  
  static void __init enable_swap_cgroup(void)
  {
  	if (!mem_cgroup_disabled() && really_do_swap_account) {
  		do_swap_account = 1;
  		memsw_file_init();
  	}
2d11085e4   Michal Hocko   memcg: do not cre...
5953
  }
6acc8b025   Michal Hocko   memcg: clean up s...
5954

2d11085e4   Michal Hocko   memcg: do not cre...
5955
  #else
6acc8b025   Michal Hocko   memcg: clean up s...
5956
  static void __init enable_swap_cgroup(void)
2d11085e4   Michal Hocko   memcg: do not cre...
5957
5958
  {
  }
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5959
  #endif
2d11085e4   Michal Hocko   memcg: do not cre...
5960

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
  #ifdef CONFIG_MEMCG_SWAP
  /**
   * mem_cgroup_swapout - transfer a memsw charge to swap
   * @page: page whose memsw charge to transfer
   * @entry: swap entry to move the charge to
   *
   * Transfer the memsw charge of @page to @entry.
   */
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
  	struct page_cgroup *pc;
  	unsigned short oldid;
  
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	VM_BUG_ON_PAGE(page_count(page), page);
  
  	if (!do_swap_account)
  		return;
  
  	pc = lookup_page_cgroup(page);
  
  	/* Readahead page, never charged */
  	if (!PageCgroupUsed(pc))
  		return;
  
  	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
  
  	oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
  	VM_BUG_ON_PAGE(oldid, page);
  
  	pc->flags &= ~PCG_MEMSW;
  	css_get(&pc->mem_cgroup->css);
  	mem_cgroup_swap_statistics(pc->mem_cgroup, true);
  }
  
  /**
   * mem_cgroup_uncharge_swap - uncharge a swap entry
   * @entry: swap entry to uncharge
   *
   * Drop the memsw charge associated with @entry.
   */
  void mem_cgroup_uncharge_swap(swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short id;
  
  	if (!do_swap_account)
  		return;
  
  	id = swap_cgroup_record(entry, 0);
  	rcu_read_lock();
  	memcg = mem_cgroup_lookup(id);
  	if (memcg) {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
6014
6015
  		if (!mem_cgroup_is_root(memcg))
  			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
  		mem_cgroup_swap_statistics(memcg, false);
  		css_put(&memcg->css);
  	}
  	rcu_read_unlock();
  }
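  
  /*
   * Illustrative sketch: how the two helpers above pair up over the
   * lifetime of a swap entry.  The caller and its context are
   * hypothetical and heavily simplified; in reality the two calls are
   * made from different places, far apart in time.
   */
  #if 0	/* illustrative only, never compiled */
  static void example_swap_handoff(struct page *page)
  {
  	swp_entry_t entry = { .val = page_private(page) };
  
  	/*
  	 * With exclusive hold of the page (off the LRU, no remaining
  	 * references, as the VM_BUG_ONs above demand), hand the memsw
  	 * charge over to the swap entry:
  	 */
  	mem_cgroup_swapout(page, entry);
  
  	/*
  	 * Later, when the swap slot itself is freed, drop the memsw
  	 * charge that was parked on the entry:
  	 */
  	mem_cgroup_uncharge_swap(entry);
  }
  #endif	/* illustrative only */
  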
  #endif
  
  /**
   * mem_cgroup_try_charge - try charging a page
   * @page: page to charge
   * @mm: mm context of the victim
   * @gfp_mask: reclaim mode
   * @memcgp: charged memcg return
   *
   * Try to charge @page to the memcg that @mm belongs to, reclaiming
   * pages according to @gfp_mask if necessary.
   *
   * Returns 0 on success, with *@memcgp pointing to the charged memcg.
   * Otherwise, an error code is returned.
   *
   * After page->mapping has been set up, the caller must finalize the
   * charge with mem_cgroup_commit_charge().  Or abort the transaction
   * with mem_cgroup_cancel_charge() in case page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
  			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
  {
  	struct mem_cgroup *memcg = NULL;
  	unsigned int nr_pages = 1;
  	int ret = 0;
  
  	if (mem_cgroup_disabled())
  		goto out;
  
  	if (PageSwapCache(page)) {
  		struct page_cgroup *pc = lookup_page_cgroup(page);
  		/*
  		 * Every swap fault against a single page tries to charge the
  		 * page, bail as early as possible.  shmem_unuse() encounters
  		 * already charged pages, too.  The USED bit is protected by
  		 * the page lock, which serializes swap cache removal, which
  		 * in turn serializes uncharging.
  		 */
  		if (PageCgroupUsed(pc))
  			goto out;
  	}
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	if (do_swap_account && PageSwapCache(page))
  		memcg = try_get_mem_cgroup_from_page(page);
  	if (!memcg)
  		memcg = get_mem_cgroup_from_mm(mm);
  
  	ret = try_charge(memcg, gfp_mask, nr_pages);
  
  	css_put(&memcg->css);
  
  	if (ret == -EINTR) {
  		memcg = root_mem_cgroup;
  		ret = 0;
  	}
  out:
  	*memcgp = memcg;
  	return ret;
  }
  
  /**
   * mem_cgroup_commit_charge - commit a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   * @lrucare: page might be on LRU already
   *
   * Finalize a charge transaction started by mem_cgroup_try_charge(),
   * after page->mapping has been set up.  This must happen atomically
   * as part of the page instantiation, i.e. under the page table lock
   * for anonymous pages, under the page lock for page and swap cache.
   *
   * In addition, the page must not be on the LRU during the commit, to
   * prevent racing with task migration.  If it might be, use @lrucare.
   *
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      bool lrucare)
  {
  	unsigned int nr_pages = 1;
  
  	VM_BUG_ON_PAGE(!page->mapping, page);
  	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
  	commit_charge(page, memcg, lrucare);
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  	local_irq_disable();
  	mem_cgroup_charge_statistics(memcg, page, nr_pages);
  	memcg_check_events(memcg, page);
  	local_irq_enable();
  
  	if (do_swap_account && PageSwapCache(page)) {
  		swp_entry_t entry = { .val = page_private(page) };
  		/*
  		 * The swap entry might not get freed for a long time,
  		 * let's not wait for it.  The page already received a
  		 * memory+swap charge, drop the swap entry duplicate.
  		 */
  		mem_cgroup_uncharge_swap(entry);
  	}
  }
  
  /**
   * mem_cgroup_cancel_charge - cancel a page charge
   * @page: page the charge was reserved for
   * @memcg: memcg the charge was reserved against
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
  void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
  {
  	unsigned int nr_pages = 1;
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	cancel_charge(memcg, nr_pages);
  }
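  
  /*
   * Illustrative sketch: the try/commit/cancel protocol documented above,
   * as a caller would drive it.  example_instantiate() is a hypothetical
   * stand-in for whatever step actually publishes the page (page cache
   * insertion or page table setup) and sets page->mapping.
   */
  #if 0	/* illustrative only, never compiled */
  static int example_charge_new_page(struct page *page, struct mm_struct *mm,
  				   gfp_t gfp_mask)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	/* Reserve the charge, reclaiming from the memcg if necessary. */
  	ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
  	if (ret)
  		return ret;
  
  	/* Publish the page (hypothetical helper); sets page->mapping. */
  	ret = example_instantiate(page);
  	if (ret) {
  		/* Instantiation failed: roll the reservation back. */
  		mem_cgroup_cancel_charge(page, memcg);
  		return ret;
  	}
  
  	/* Make the charge permanent and account the statistics. */
  	mem_cgroup_commit_charge(page, memcg, false);
  	return 0;
  }
  #endif	/* illustrative only */
  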
  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
  			   unsigned long nr_mem, unsigned long nr_memsw,
  			   unsigned long nr_anon, unsigned long nr_file,
  			   unsigned long nr_huge, struct page *dummy_page)
  {
  	unsigned long flags;
  	if (!mem_cgroup_is_root(memcg)) {
  		if (nr_mem)
  			res_counter_uncharge(&memcg->res,
  					     nr_mem * PAGE_SIZE);
  		if (nr_memsw)
  			res_counter_uncharge(&memcg->memsw,
  					     nr_memsw * PAGE_SIZE);
  		memcg_oom_recover(memcg);
  	}
  
  	local_irq_save(flags);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
  	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
  	memcg_check_events(memcg, dummy_page);
  	local_irq_restore(flags);
  }
  
  static void uncharge_list(struct list_head *page_list)
  {
  	struct mem_cgroup *memcg = NULL;
  	unsigned long nr_memsw = 0;
  	unsigned long nr_anon = 0;
  	unsigned long nr_file = 0;
  	unsigned long nr_huge = 0;
  	unsigned long pgpgout = 0;
  	unsigned long nr_mem = 0;
  	struct list_head *next;
  	struct page *page;
  
  	next = page_list->next;
  	do {
  		unsigned int nr_pages = 1;
  		struct page_cgroup *pc;
  
  		page = list_entry(next, struct page, lru);
  		next = page->lru.next;
  
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		VM_BUG_ON_PAGE(page_count(page), page);
  
  		pc = lookup_page_cgroup(page);
  		if (!PageCgroupUsed(pc))
  			continue;
  
  		/*
  		 * Nobody should be changing or seriously looking at
  		 * pc->mem_cgroup and pc->flags at this point; we have
  		 * fully exclusive access to the page.
  		 */
  
  		if (memcg != pc->mem_cgroup) {
  			if (memcg) {
  				uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
  					       nr_anon, nr_file, nr_huge, page);
  				pgpgout = nr_mem = nr_memsw = 0;
  				nr_anon = nr_file = nr_huge = 0;
  			}
  			memcg = pc->mem_cgroup;
  		}
  
  		if (PageTransHuge(page)) {
  			nr_pages <<= compound_order(page);
  			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  			nr_huge += nr_pages;
  		}
  
  		if (PageAnon(page))
  			nr_anon += nr_pages;
  		else
  			nr_file += nr_pages;
  
  		if (pc->flags & PCG_MEM)
  			nr_mem += nr_pages;
  		if (pc->flags & PCG_MEMSW)
  			nr_memsw += nr_pages;
  		pc->flags = 0;
  
  		pgpgout++;
  	} while (next != page_list);
  
  	if (memcg)
  		uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
  			       nr_anon, nr_file, nr_huge, page);
  }
  
  /**
   * mem_cgroup_uncharge - uncharge a page
   * @page: page to uncharge
   *
   * Uncharge a page previously charged with mem_cgroup_try_charge() and
   * mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge(struct page *page)
  {
  	struct page_cgroup *pc;
  
  	if (mem_cgroup_disabled())
  		return;
  	/* Don't touch page->lru of any random page, pre-check: */
  	pc = lookup_page_cgroup(page);
  	if (!PageCgroupUsed(pc))
  		return;
  	INIT_LIST_HEAD(&page->lru);
  	uncharge_list(&page->lru);
  }

  /**
   * mem_cgroup_uncharge_list - uncharge a list of pages
   * @page_list: list of pages to uncharge
   *
   * Uncharge a list of pages previously charged with
   * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge_list(struct list_head *page_list)
  {
  	if (mem_cgroup_disabled())
  		return;

  	if (!list_empty(page_list))
  		uncharge_list(page_list);
  }
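  
  /*
   * Illustrative sketch: batched release of pages charged with the API
   * above, as a reclaim-style caller would do it.  The private list and
   * how pages end up on it are hypothetical; the point is that a single
   * call uncharges the whole batch, coalescing the res_counter updates
   * per memcg before the pages go back to the allocator.
   */
  #if 0	/* illustrative only, never compiled */
  static void example_release_batch(struct list_head *pages_to_free)
  {
  	/* Uncharge every page on the list in one pass ... */
  	mem_cgroup_uncharge_list(pages_to_free);
  	/* ... then hand the pages back to the page allocator in bulk. */
  	free_hot_cold_page_list(pages_to_free, true);
  }
  #endif	/* illustrative only */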
  
  /**
   * mem_cgroup_migrate - migrate a charge to another page
   * @oldpage: currently charged page
   * @newpage: page to transfer the charge to
   * @lrucare: both pages might be on the LRU already
   *
   * Migrate the charge from @oldpage to @newpage.
   *
   * Both pages must be locked, @newpage->mapping must be set up.
   */
  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  			bool lrucare)
  {
  	struct page_cgroup *pc;
  	int isolated;
  
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
  	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
  	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
  		       newpage);
  
  	if (mem_cgroup_disabled())
  		return;
  
  	/* Page cache replacement: new page already charged? */
  	pc = lookup_page_cgroup(newpage);
  	if (PageCgroupUsed(pc))
  		return;
  
  	/* Re-entrant migration: old page already uncharged? */
  	pc = lookup_page_cgroup(oldpage);
  	if (!PageCgroupUsed(pc))
  		return;
  
  	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
  	VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
  	if (lrucare)
  		lock_page_lru(oldpage, &isolated);
  
  	pc->flags = 0;
  
  	if (lrucare)
  		unlock_page_lru(oldpage, isolated);
  	commit_charge(newpage, pc->mem_cgroup, lrucare);
  }
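  
  /*
   * Illustrative sketch: a migration-style caller of the helper above.
   * The replacement step itself is hypothetical; the constraints (both
   * pages locked, newpage->mapping already set up) are the ones stated
   * in the comment above.
   */
  #if 0	/* illustrative only, never compiled */
  static void example_replace_page(struct page *oldpage, struct page *newpage)
  {
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
  
  	/* ... newpage has taken oldpage's place in the mapping ... */
  
  	/*
  	 * Move the memcg charge along with the data; pass lrucare=true
  	 * because either page may already be on an LRU list here.
  	 */
  	mem_cgroup_migrate(oldpage, newpage, true);
  }
  #endif	/* illustrative only */
  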
  /*
   * subsys_initcall() for memory controller.
   *
   * Some parts like hotcpu_notifier() have to be initialized from this context
   * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
   * everything that doesn't depend on a specific mem_cgroup structure should
   * be initialized from here.
   */
  static int __init mem_cgroup_init(void)
  {
  	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
  	enable_swap_cgroup();
  	mem_cgroup_soft_limit_tree_init();
  	memcg_stock_init();
  	return 0;
  }
  subsys_initcall(mem_cgroup_init);