mm/memcontrol.c

  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
   * Memory thresholds
   * Copyright (C) 2009 Nokia Corporation
   * Author: Kirill A. Shutemov
   *
   * Kernel Memory Controller
   * Copyright (C) 2012 Parallels Inc. and Google Inc.
   * Authors: Glauber Costa and Suleiman Souhlal
   *
   * Native page reclaim
   * Charge lifetime sanitation
   * Lockless page tracking & accounting
   * Unified hierarchy configuration model
   * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <asm/uaccess.h>
#include <trace/events/vmscan.h>
  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  EXPORT_SYMBOL(memory_cgrp_subsys);

struct mem_cgroup *root_mem_cgroup __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
/* Kernel memory accounting disabled? */
static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif
  /* Whether legacy memory+swap accounting is active */
  static bool do_memsw_account(void)
  {
  	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
  }
static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"dirty",
	"writeback",
	"swap",
};
  static const char * const mem_cgroup_events_names[] = {
  	"pgpgin",
  	"pgpgout",
  	"pgfault",
  	"pgmajfault",
  };
  static const char * const mem_cgroup_lru_names[] = {
  	"inactive_anon",
  	"active_anon",
  	"inactive_file",
  	"active_file",
  	"unevictable",
  };
  #define THRESHOLDS_EVENTS_TARGET 128
  #define SOFTLIMIT_EVENTS_TARGET 1024
  #define NUMAINFO_EVENTS_TARGET	1024

  /*
   * Cgroups above their limits are maintained in a RB-Tree, independent of
   * their hierarchy representation
   */
  
  struct mem_cgroup_tree_per_zone {
  	struct rb_root rb_root;
  	spinlock_t lock;
  };
  
  struct mem_cgroup_tree_per_node {
  	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_tree {
  	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  };
  
  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
  /* for OOM */
  struct mem_cgroup_eventfd_list {
  	struct list_head list;
  	struct eventfd_ctx *eventfd;
  };

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removal.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below are needed to unregister the event when
	 * userspace closes the eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON	0x1U
#define MOVE_FILE	0x2U
#define MOVE_MASK	(MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mm_struct  *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
#define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
  	NR_CHARGE_TYPE,
  };
/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
  };
#define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
#define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL		(0)
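/*
 * Illustrative example (not used directly by the code in this file):
 * packing and unpacking a cft->private value with the helpers above.
 *
 *	priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);
 *
 * then MEMFILE_TYPE(priv) yields _OOM_TYPE (the res_type from the high
 * 16 bits) and MEMFILE_ATTR(priv) yields OOM_CONTROL (the attribute from
 * the low 16 bits).
 */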

  /* Some nice accessors for the vmpressure. */
  struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
  {
  	if (!memcg)
  		memcg = root_mem_cgroup;
  	return &memcg->vmpressure;
  }
  
  struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
  {
  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
  }
  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
  	return (memcg == root_mem_cgroup);
  }
#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using the cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few of them are kmem-limited. Also, if we have, for instance,
 *  200 memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200-entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

  /* Protects memcg_nr_cache_ids */
  static DECLARE_RWSEM(memcg_cache_ids_sem);
  
  void memcg_get_cache_ids(void)
  {
  	down_read(&memcg_cache_ids_sem);
  }
  
  void memcg_put_cache_ids(void)
  {
  	up_read(&memcg_cache_ids_sem);
  }
/*
 * MIN_SIZE is different from 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is not strictly necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't necessarily have to
 * increase ours as well if it increases.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);

#endif /* !CONFIG_SLOB */

static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
	int nid = zone_to_nid(zone);
	int zid = zone_idx(zone);
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
  }
  /**
   * mem_cgroup_css_from_page - css of the memcg associated with a page
   * @page: page of interest
   *
   * If memcg is bound to the default hierarchy, css of the memcg associated
   * with @page is returned.  The returned css remains associated with @page
   * until it is released.
   *
   * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
   * is returned.
 */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg;
	memcg = page->mem_cgroup;
	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		memcg = root_mem_cgroup;
  	return &memcg->css;
  }
  /**
   * page_cgroup_ino - return inode number of the memcg a page is charged to
   * @page: the page
   *
   * Look up the closest online ancestor of the memory cgroup @page is charged to
   * and return its inode number or 0 if @page is not charged to any cgroup. It
   * is safe to call this function without holding a reference to @page.
   *
   * Note, this function is inherently racy, because there is nothing to prevent
   * the cgroup inode from getting torn down and potentially reallocated a moment
   * after page_cgroup_ino() returns, so it only should be used by callers that
   * do not care (such as procfs interfaces).
   */
  ino_t page_cgroup_ino(struct page *page)
  {
  	struct mem_cgroup *memcg;
  	unsigned long ino = 0;
  
  	rcu_read_lock();
  	memcg = READ_ONCE(page->mem_cgroup);
  	while (memcg && !(memcg->css.flags & CSS_ONLINE))
  		memcg = parent_mem_cgroup(memcg);
  	if (memcg)
  		ino = cgroup_ino(memcg->css.cgroup);
  	rcu_read_unlock();
  	return ino;
  }
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
  }
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
					 struct mem_cgroup_tree_per_zone *mctz,
  					 unsigned long new_usage_in_excess)
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
  static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz)
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
				       struct mem_cgroup_tree_per_zone *mctz)
{
	unsigned long flags;

	spin_lock_irqsave(&mctz->lock, flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, flags);
  }
  static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
  {
  	unsigned long nr_pages = page_counter_read(&memcg->memory);
  	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
  	unsigned long excess = 0;
  
  	if (nr_pages > soft_limit)
  		excess = nr_pages - soft_limit;
  
  	return excess;
  }
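/*
 * Worked example (illustrative only): with a usage of 1000 pages and a
 * soft limit of 800 pages, soft_limit_excess() returns 200; at or below
 * the soft limit it returns 0.
 */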
  
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
	unsigned long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	mctz = soft_limit_tree_from_page(page);
	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_page_zoneinfo(memcg, page);
		excess = soft_limit_excess(memcg);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			unsigned long flags;

			spin_lock_irqsave(&mctz->lock, flags);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(mz, mctz, excess);
			spin_unlock_irqrestore(&mctz->lock, flags);
  		}
  	}
  }
  
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
  {
	struct mem_cgroup_tree_per_zone *mctz;
	struct mem_cgroup_per_zone *mz;
	int nid, zid;

	for_each_node(nid) {
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
			mctz = soft_limit_tree_node_zone(nid, zid);
			mem_cgroup_remove_exceeded(mz, mctz);
  		}
  	}
  }
  
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
	__mem_cgroup_remove_exceeded(mz, mctz);
	if (!soft_limit_excess(mz->memcg) ||
	    !css_tryget_online(&mz->memcg->css))
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
	spin_lock_irq(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);
  	return mz;
  }
/*
 * Return page count for single (non recursive) @memcg.
 *
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counter in memcg's counter.
 *
 * But this _read() function is used for the user interface now. Userspace
 * accounts memory usage by memory cgroup and _always_ requires an exact
 * value because it accounts memory. Even if we provided a quick-and-fuzzy
 * read, we would always have to visit all online cpus and make the sum. So,
 * for now, unnecessary synchronization is not implemented. (It is only
 * implemented for cpu hotplug.)
 *
 * If there are kernel internal actions which can make use of a not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 */
  static unsigned long
  mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	/* Per-cpu values can be negative, use a signed accumulator */
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
  	/*
  	 * Summing races with updates, so val may be negative.  Avoid exposing
  	 * transient negative values.
  	 */
  	if (val < 0)
  		val = 0;
  	return val;
  }
static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;
	for_each_possible_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
  	return val;
  }
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool compound, int nr_pages)
{
	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (PageAnon(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (compound) {
		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);
	}

	/* pagein of a big page is an event. So, ignore page size */
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
  }
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	unsigned long nr = 0;
	int zid;
	VM_BUG_ON((unsigned)nid >= nr_node_ids);

  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		struct mem_cgroup_per_zone *mz;
  		enum lru_list lru;
  
  		for_each_lru(lru) {
  			if (!(BIT(lru) & lru_mask))
  				continue;
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			nr += mz->lru_size[lru];
  		}
  	}
  	return nr;
  }

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	unsigned long nr = 0;
	int nid;

	for_each_node_state(nid, N_MEMORY)
		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return nr;
  }
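/*
 * Illustrative example (not used by the code in this file): an lru_mask
 * selecting only the file-backed LRU lists would be
 * BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE).
 */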
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;
	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
  	if ((long)next - (long)val < 0) {
  		switch (target) {
  		case MEM_CGROUP_TARGET_THRESH:
  			next = val + THRESHOLDS_EVENTS_TARGET;
  			break;
  		case MEM_CGROUP_TARGET_SOFTLIMIT:
  			next = val + SOFTLIMIT_EVENTS_TARGET;
  			break;
  		case MEM_CGROUP_TARGET_NUMAINFO:
  			next = val + NUMAINFO_EVENTS_TARGET;
  			break;
  		default:
  			break;
  		}
  		__this_cpu_write(memcg->stat->targets[target], next);
  		return true;
  	}
  	return false;
  }
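/*
 * For example, with THRESHOLDS_EVENTS_TARGET == 128, memcg_check_events()
 * below re-evaluates the thresholds at most roughly once per 128 page
 * events on each CPU; the soft limit tree and NUMA info updates use the
 * coarser 1024-event targets.
 */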
  
  /*
   * Check events in order.
   *
   */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
  }
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
}
EXPORT_SYMBOL(mem_cgroup_from_task);

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *memcg = NULL;

	rcu_read_lock();
	do {
		/*
		 * Page cache insertions can happen without an
		 * actual mm context, e.g. during disk probing
		 * on boot, loopback IO, acct() writes etc.
		 */
		if (unlikely(!mm))
			memcg = root_mem_cgroup;
		else {
			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
			if (unlikely(!memcg))
				memcg = root_mem_cgroup;
		}
	} while (!css_tryget_online(&memcg->css));
	rcu_read_unlock();
	return memcg;
  }
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
   * @prev: previously returned memcg, NULL on first invocation
   * @reclaim: cookie for shared reclaim walks, NULL for full walks
   *
   * Returns references to children of the hierarchy below @root, or
   * @root itself, or %NULL after a full round-trip.
   *
   * Caller must pass the return value in @prev on subsequent
   * invocations for reference counting, or use mem_cgroup_iter_break()
   * to cancel a hierarchy walk before the round-trip is complete.
   *
   * Reclaimers can specify a zone and a priority level in @reclaim to
   * divide up the memcgs in the hierarchy among all concurrent
   * reclaimers operating on the same zone and priority.
   */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	if (prev && !reclaim)
		pos = prev;

	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

  	if (reclaim) {
  		struct mem_cgroup_per_zone *mz;
  
  		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
  		iter = &mz->iter[reclaim->priority];
  
  		if (prev && reclaim->generation != iter->generation)
  			goto out_unlock;
		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
  			 * css reference reached zero, so iter->position will
  			 * be cleared by ->css_released. However, we should not
  			 * rely on this happening soon, because ->css_released
  			 * is called from a work queue, and by busy-waiting we
  			 * might block it. So we clear iter->position right
  			 * away.
			 */
  			(void)cmpxchg(&iter->position, pos, NULL);
  		}
  	}
  
  	if (pos)
  		css = &pos->css;
  
  	for (;;) {
  		css = css_next_descendant_pre(css, &root->css);
  		if (!css) {
  			/*
  			 * Reclaimers share the hierarchy walk, and a
  			 * new one might jump in right at the end of
  			 * the hierarchy - make sure they see at least
  			 * one group and restart from the beginning.
  			 */
  			if (!prev)
  				continue;
  			break;
  		}

  		/*
  		 * Verify the css and acquire a reference.  The root
  		 * is provided by the caller, so we know it's alive
  		 * and kicking, and don't take an extra reference.
  		 */
  		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		memcg = NULL;
	}
  
  	if (reclaim) {
		/*
  		 * The position could have already been updated by a competing
  		 * thread, so check that the value hasn't changed since we read
  		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);
  		if (pos)
  			css_put(&pos->css);
  
  		if (!memcg)
  			iter->generation++;
  		else if (!prev)
  			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	if (prev && prev != root)
		css_put(&prev->css);
	return memcg;
}

  /**
   * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   * @root: hierarchy root
   * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   */
  void mem_cgroup_iter_break(struct mem_cgroup *root,
  			   struct mem_cgroup *prev)
  {
  	if (!root)
  		root = root_mem_cgroup;
  	if (prev && prev != root)
  		css_put(&prev->css);
  }

  static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
  {
  	struct mem_cgroup *memcg = dead_memcg;
  	struct mem_cgroup_reclaim_iter *iter;
  	struct mem_cgroup_per_zone *mz;
  	int nid, zid;
  	int i;
  
  	while ((memcg = parent_mem_cgroup(memcg))) {
  		for_each_node(nid) {
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  				for (i = 0; i <= DEF_PRIORITY; i++) {
  					iter = &mz->iter[i];
  					cmpxchg(&iter->position,
  						dead_memcg, NULL);
  				}
  			}
  		}
  	}
  }
  /*
   * Iteration constructs for visiting all cgroups (under a tree).  If
   * loops are exited prematurely (break), mem_cgroup_iter_break() must
   * be used for reference counting.
   */
  #define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
  	     iter = mem_cgroup_iter(NULL, iter, NULL))
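/*
 * Illustrative sketch (not part of the original file): a walk that stops
 * early must hand the last returned memcg back to mem_cgroup_iter_break()
 * so that its css reference is dropped. should_stop() is a hypothetical
 * predicate standing in for whatever condition ends the walk:
 *
 *	struct mem_cgroup *iter;
 *
 *	for_each_mem_cgroup_tree(iter, root) {
 *		if (should_stop(iter)) {
 *			mem_cgroup_iter_break(root, iter);
 *			break;
 *		}
 *	}
 */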

  /**
   * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
   * @zone: zone of the wanted lruvec
   * @memcg: memcg of the wanted lruvec
   *
   * Returns the lru list vector holding pages for the given @zone and
 * @memcg.  This can be the global zone lruvec, if the memory controller
   * is disabled.
   */
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
  	struct lruvec *lruvec;

  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}

  	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
  }
/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
   *
   * This function is only safe when following the LRU page isolation
   * and putback protocol: the LRU lock must be held, and the page must
   * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

  	mz = mem_cgroup_page_zoneinfo(memcg, page);
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
  }

/**
   * mem_cgroup_update_lru_size - account for adding or removing an lru page
   * @lruvec: mem_cgroup per zone lru vector
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
 *
   * This function must be called under lru_lock, just before a page is added
   * to or just after a page is removed from an lru list (that ordering being
   * so as to allow it to check that lru_size 0 is consistent with list_empty).
 */
void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
				int nr_pages)
{
	struct mem_cgroup_per_zone *mz;
	unsigned long *lru_size;
	long size;
	bool empty;

	__update_lru_size(lruvec, lru, nr_pages);
	if (mem_cgroup_disabled())
		return;
	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	lru_size = mz->lru_size + lru;
  	empty = list_empty(lruvec->lists + lru);
  
  	if (nr_pages < 0)
  		*lru_size += nr_pages;
  
  	size = *lru_size;
  	if (WARN_ONCE(size < 0 || empty != !size,
  		"%s(%p, %d, %d): lru_size %ld but %sempty
  ",
  		__func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
  		VM_BUG_ON(1);
  		*lru_size = 0;
  	}
  
  	if (nr_pages > 0)
  		*lru_size += nr_pages;
  }

bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
{
	struct mem_cgroup *task_memcg;
	struct task_struct *p;
	bool ret;

	p = find_lock_task_mm(task);
	if (p) {
		task_memcg = get_mem_cgroup_from_mm(p->mm);
  		task_unlock(p);
  	} else {
  		/*
  		 * All threads may have already detached their mm's, but the oom
  		 * killer still needs to detect if they have already been oom
  		 * killed to prevent needlessly killing additional tasks.
  		 */
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1030
  		rcu_read_lock();
2314b42db   Johannes Weiner   mm: memcontrol: d...
1031
1032
  		task_memcg = mem_cgroup_from_task(task);
  		css_get(&task_memcg->css);
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1033
  		rcu_read_unlock();
de077d222   David Rientjes   oom, memcg: fix e...
1034
  	}
2314b42db   Johannes Weiner   mm: memcontrol: d...
1035
1036
  	ret = mem_cgroup_is_descendant(task_memcg, memcg);
  	css_put(&task_memcg->css);
4c4a22148   David Rientjes   memcontrol: move ...
1037
1038
  	return ret;
  }
19942822d   Johannes Weiner   memcg: prevent en...
1039
  /**
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1040
   * mem_cgroup_margin - calculate chargeable space of a memory cgroup
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1041
   * @memcg: the memory cgroup
19942822d   Johannes Weiner   memcg: prevent en...
1042
   *
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1043
   * Returns the maximum amount of memory @memcg can be charged with, in
7ec99d621   Johannes Weiner   memcg: unify char...
1044
   * pages.
19942822d   Johannes Weiner   memcg: prevent en...
1045
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1046
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
19942822d   Johannes Weiner   memcg: prevent en...
1047
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1048
1049
1050
  	unsigned long margin = 0;
  	unsigned long count;
  	unsigned long limit;
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1051

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1052
  	count = page_counter_read(&memcg->memory);
4db0c3c29   Jason Low   mm: remove rest o...
1053
  	limit = READ_ONCE(memcg->memory.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1054
1055
  	if (count < limit)
  		margin = limit - count;
7941d2145   Johannes Weiner   mm: memcontrol: d...
1056
  	if (do_memsw_account()) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1057
  		count = page_counter_read(&memcg->memsw);
4db0c3c29   Jason Low   mm: remove rest o...
1058
  		limit = READ_ONCE(memcg->memsw.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1059
1060
1061
1062
1063
  		if (count <= limit)
  			margin = min(margin, limit - count);
  	}
  
  	return margin;
19942822d   Johannes Weiner   memcg: prevent en...
1064
  }
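  /*
   * Illustrative example (hypothetical numbers): with memory.limit = 1000
   * pages and 700 pages charged, the memory margin is 300 pages.  If memsw
   * accounting is enabled with memsw.limit = 1200 and 950 pages of
   * memory+swap charged, only 250 more pages fit under that limit, so the
   * function returns min(300, 250) = 250.
   */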
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1065
  /*
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1066
   * A routine for checking "mem" is under move_account() or not.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1067
   *
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1068
1069
1070
   * Checking whether a cgroup is mc.from, mc.to, or under the hierarchy of
   * moving cgroups. This is for waiting at high memory pressure
   * caused by "move".
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1071
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1072
  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1073
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1074
1075
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1076
  	bool ret = false;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1077
1078
1079
1080
1081
1082
1083
1084
1085
  	/*
  	 * Unlike task_move routines, we access mc.to, mc.from not under
  	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
  	 */
  	spin_lock(&mc.lock);
  	from = mc.from;
  	to = mc.to;
  	if (!from)
  		goto unlock;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1086

2314b42db   Johannes Weiner   mm: memcontrol: d...
1087
1088
  	ret = mem_cgroup_is_descendant(from, memcg) ||
  		mem_cgroup_is_descendant(to, memcg);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1089
1090
  unlock:
  	spin_unlock(&mc.lock);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1091
1092
  	return ret;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1093
  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1094
1095
  {
  	if (mc.moving_task && current != mc.moving_task) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1096
  		if (mem_cgroup_under_move(memcg)) {
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
  			DEFINE_WAIT(wait);
  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  			/* moving charge context might have finished. */
  			if (mc.moving_task)
  				schedule();
  			finish_wait(&mc.waitq, &wait);
  			return true;
  		}
  	}
  	return false;
  }
58cf188ed   Sha Zhengju   memcg, oom: provi...
1108
  #define K(x) ((x) << (PAGE_SHIFT-10))
e222432bf   Balbir Singh   memcg: show memcg...
1109
  /**
58cf188ed   Sha Zhengju   memcg, oom: provi...
1110
   * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
e222432bf   Balbir Singh   memcg: show memcg...
1111
1112
1113
1114
1115
1116
1117
1118
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
58cf188ed   Sha Zhengju   memcg, oom: provi...
1119
1120
  	struct mem_cgroup *iter;
  	unsigned int i;
e222432bf   Balbir Singh   memcg: show memcg...
1121

e222432bf   Balbir Singh   memcg: show memcg...
1122
  	rcu_read_lock();
2415b9f5c   Balasubramani Vivekanandan   memcg: print cgro...
1123
1124
1125
1126
1127
1128
1129
  	if (p) {
  		pr_info("Task in ");
  		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
  		pr_cont(" killed as a result of limit of ");
  	} else {
  		pr_info("Memory limit reached of cgroup ");
  	}
e61734c55   Tejun Heo   cgroup: remove cg...
1130
  	pr_cont_cgroup_path(memcg->css.cgroup);
0346dadbf   Greg Thelen   memcg: remove ext...
1131
1132
  	pr_cont("
  ");
e222432bf   Balbir Singh   memcg: show memcg...
1133

e222432bf   Balbir Singh   memcg: show memcg...
1134
  	rcu_read_unlock();
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
  	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memory)),
  		K((u64)memcg->memory.limit), memcg->memory.failcnt);
  	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memsw)),
  		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
  	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->kmem)),
  		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1147
1148
  
  	for_each_mem_cgroup_tree(iter, memcg) {
e61734c55   Tejun Heo   cgroup: remove cg...
1149
1150
  		pr_info("Memory cgroup stats for ");
  		pr_cont_cgroup_path(iter->css.cgroup);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1151
1152
1153
  		pr_cont(":");
  
  		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
37e843511   Vladimir Davydov   mm: memcontrol: c...
1154
  			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
58cf188ed   Sha Zhengju   memcg, oom: provi...
1155
  				continue;
484ebb3b8   Greg Thelen   memcg: make mem_c...
1156
  			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
58cf188ed   Sha Zhengju   memcg, oom: provi...
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
  				K(mem_cgroup_read_stat(iter, i)));
  		}
  
  		for (i = 0; i < NR_LRU_LISTS; i++)
  			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
  				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
  
  		pr_cont("
  ");
  	}
e222432bf   Balbir Singh   memcg: show memcg...
1167
  }
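  /*
   * Illustrative output (hypothetical paths and values), assembled from the
   * format strings above:
   *
   *   Task in /workload killed as a result of limit of /workload
   *   memory: usage 524288kB, limit 524288kB, failcnt 1024
   *   memory+swap: usage 1048576kB, limit 2097152kB, failcnt 0
   *   kmem: usage 0kB, limit 2097152kB, failcnt 0
   *   Memory cgroup stats for /workload: ...
   */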
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1168
1169
1170
1171
  /*
   * This function returns the number of memcgs under the hierarchy tree.
   * Returns 1 (self count) if there are no children.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1172
  static int mem_cgroup_count_children(struct mem_cgroup *memcg)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1173
1174
  {
  	int num = 0;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1175
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1176
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1177
  		num++;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1178
1179
  	return num;
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1180
  /*
a63d83f42   David Rientjes   oom: badness heur...
1181
1182
   * Return the memory (and swap, if configured) limit for a memcg.
   */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1183
  static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
a63d83f42   David Rientjes   oom: badness heur...
1184
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1185
  	unsigned long limit;
f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1186

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1187
  	limit = memcg->memory.limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1188
  	if (mem_cgroup_swappiness(memcg)) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1189
  		unsigned long memsw_limit;
37e843511   Vladimir Davydov   mm: memcontrol: c...
1190
  		unsigned long swap_limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1191

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1192
  		memsw_limit = memcg->memsw.limit;
37e843511   Vladimir Davydov   mm: memcontrol: c...
1193
1194
1195
  		swap_limit = memcg->swap.limit;
  		swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
  		limit = min(limit + swap_limit, memsw_limit);
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1196
  	}
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1197
  	return limit;
a63d83f42   David Rientjes   oom: badness heur...
1198
  }
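  /*
   * Illustrative example (hypothetical numbers): with swappiness non-zero,
   * memory.limit = 1000 pages, swap.limit = 500 but only 300 pages of swap
   * configured, and memsw.limit = 1100, the returned base for OOM badness
   * is min(1000 + min(500, 300), 1100) = 1100 pages.
   */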
b6e6edcfa   Johannes Weiner   mm: memcontrol: r...
1199
  static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
19965460e   David Rientjes   mm, memcg: make m...
1200
  				     int order)
9cbb78bb3   David Rientjes   mm, memcg: introd...
1201
  {
6e0fc46dc   David Rientjes   mm, oom: organize...
1202
1203
1204
1205
1206
  	struct oom_control oc = {
  		.zonelist = NULL,
  		.nodemask = NULL,
  		.gfp_mask = gfp_mask,
  		.order = order,
6e0fc46dc   David Rientjes   mm, oom: organize...
1207
  	};
9cbb78bb3   David Rientjes   mm, memcg: introd...
1208
1209
1210
1211
1212
  	struct mem_cgroup *iter;
  	unsigned long chosen_points = 0;
  	unsigned long totalpages;
  	unsigned int points = 0;
  	struct task_struct *chosen = NULL;
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1213
  	mutex_lock(&oom_lock);
876aafbfd   David Rientjes   mm, memcg: move a...
1214
  	/*
465adcf1e   David Rientjes   mm, memcg: give e...
1215
1216
1217
  	 * If current has a pending SIGKILL or is exiting, then automatically
  	 * select it.  The goal is to allow it to allocate so that it may
  	 * quickly exit and free its memory.
876aafbfd   David Rientjes   mm, memcg: move a...
1218
  	 */
d003f371b   Oleg Nesterov   oom: don't assume...
1219
  	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
16e951966   Johannes Weiner   mm: oom_kill: cle...
1220
  		mark_oom_victim(current);
3ef22dfff   Michal Hocko   oom, oom_reaper: ...
1221
  		try_oom_reaper(current);
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1222
  		goto unlock;
876aafbfd   David Rientjes   mm, memcg: move a...
1223
  	}
6e0fc46dc   David Rientjes   mm, oom: organize...
1224
  	check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1225
  	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1226
  	for_each_mem_cgroup_tree(iter, memcg) {
72ec70299   Tejun Heo   cgroup: make task...
1227
  		struct css_task_iter it;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1228
  		struct task_struct *task;
72ec70299   Tejun Heo   cgroup: make task...
1229
1230
  		css_task_iter_start(&iter->css, &it);
  		while ((task = css_task_iter_next(&it))) {
6e0fc46dc   David Rientjes   mm, oom: organize...
1231
  			switch (oom_scan_process_thread(&oc, task, totalpages)) {
9cbb78bb3   David Rientjes   mm, memcg: introd...
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
  			case OOM_SCAN_SELECT:
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = ULONG_MAX;
  				get_task_struct(chosen);
  				/* fall through */
  			case OOM_SCAN_CONTINUE:
  				continue;
  			case OOM_SCAN_ABORT:
72ec70299   Tejun Heo   cgroup: make task...
1242
  				css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1243
1244
1245
  				mem_cgroup_iter_break(memcg, iter);
  				if (chosen)
  					put_task_struct(chosen);
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1246
  				goto unlock;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1247
1248
1249
1250
  			case OOM_SCAN_OK:
  				break;
  			};
  			points = oom_badness(task, memcg, NULL, totalpages);
d49ad9355   David Rientjes   mm, oom: prefer t...
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
  			if (!points || points < chosen_points)
  				continue;
  			/* Prefer thread group leaders for display purposes */
  			if (points == chosen_points &&
  			    thread_group_leader(chosen))
  				continue;
  
  			if (chosen)
  				put_task_struct(chosen);
  			chosen = task;
  			chosen_points = points;
  			get_task_struct(chosen);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1263
  		}
72ec70299   Tejun Heo   cgroup: make task...
1264
  		css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1265
  	}
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1266
1267
  	if (chosen) {
  		points = chosen_points * 1000 / totalpages;
6e0fc46dc   David Rientjes   mm, oom: organize...
1268
1269
  		oom_kill_process(&oc, chosen, points, totalpages, memcg,
  				 "Memory cgroup out of memory");
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1270
1271
1272
  	}
  unlock:
  	mutex_unlock(&oom_lock);
b6e6edcfa   Johannes Weiner   mm: memcontrol: r...
1273
  	return chosen;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1274
  }
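  /*
   * Illustrative example (hypothetical numbers): if two candidate tasks in
   * the hierarchy score 120 and 450 via oom_badness(), the second becomes
   * the chosen victim and is killed with its score normalized to
   * chosen_points * 1000 / totalpages.
   */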
ae6e71d3d   Michele Curti   mm/memcontrol.c: ...
1275
  #if MAX_NUMNODES > 1
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1276
1277
  /**
   * test_mem_cgroup_node_reclaimable
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1278
   * @memcg: the target memcg
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1279
1280
1281
1282
1283
1284
1285
   * @nid: the node ID to be checked.
   * @noswap: specify true here if the user wants file-only information.
   *
   * This function returns whether the specified memcg contains any
   * reclaimable pages on a node. Returns true if there are any reclaimable
   * pages in the node.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1286
  static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1287
1288
  		int nid, bool noswap)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1289
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1290
1291
1292
  		return true;
  	if (noswap || !total_swap_pages)
  		return false;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1293
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1294
1295
1296
1297
  		return true;
  	return false;
  
  }
889976dbc   Ying Han   memcg: reclaim me...
1298
1299
1300
1301
1302
1303
1304
  
  /*
   * Always updating the nodemask is not very good - even if we have an empty
   * list or the wrong list here, we can start from some node and traverse all
   * nodes based on the zonelist. So update the list loosely once per 10 secs.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1305
  static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1306
1307
  {
  	int nid;
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1308
1309
1310
1311
  	/*
  	 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
  	 * pagein/pageout changes since the last update.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1312
  	if (!atomic_read(&memcg->numainfo_events))
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1313
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1314
  	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
889976dbc   Ying Han   memcg: reclaim me...
1315
  		return;
889976dbc   Ying Han   memcg: reclaim me...
1316
  	/* make a nodemask where this memcg uses memory from */
31aaea4aa   Lai Jiangshan   memcontrol: use N...
1317
  	memcg->scan_nodes = node_states[N_MEMORY];
889976dbc   Ying Han   memcg: reclaim me...
1318

31aaea4aa   Lai Jiangshan   memcontrol: use N...
1319
  	for_each_node_mask(nid, node_states[N_MEMORY]) {
889976dbc   Ying Han   memcg: reclaim me...
1320

c0ff4b854   Raghavendra K T   memcg: rename mem...
1321
1322
  		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
  			node_clear(nid, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1323
  	}
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1324

c0ff4b854   Raghavendra K T   memcg: rename mem...
1325
1326
  	atomic_set(&memcg->numainfo_events, 0);
  	atomic_set(&memcg->numainfo_updating, 0);
889976dbc   Ying Han   memcg: reclaim me...
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
  }
  
  /*
   * Selecting a node from which to start reclaim. Because all we need is to
   * reduce the usage counter, starting from anywhere is OK. Considering
   * memory reclaim from the current node, there are pros and cons:
   *
   * Freeing memory from the current node means freeing memory from a node
   * which we'll use or have used, so it may hurt LRU ordering. And if
   * several threads hit their limits, they will all contend on that node.
   * But freeing from a remote node means higher reclaim costs because of
   * memory latency.
   *
   * For now, we use round-robin. A better algorithm is welcome.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1341
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1342
1343
  {
  	int node;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1344
1345
  	mem_cgroup_may_update_nodemask(memcg);
  	node = memcg->last_scanned_node;
889976dbc   Ying Han   memcg: reclaim me...
1346

0edaf86cf   Andrew Morton   include/linux/nod...
1347
  	node = next_node_in(node, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1348
  	/*
fda3d69be   Michal Hocko   mm/memcontrol.c:m...
1349
1350
1351
  	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
  	 * last time it really checked all the LRUs due to rate limiting.
  	 * Fall back to the current node in that case for simplicity.
889976dbc   Ying Han   memcg: reclaim me...
1352
1353
1354
  	 */
  	if (unlikely(node == MAX_NUMNODES))
  		node = numa_node_id();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1355
  	memcg->last_scanned_node = node;
889976dbc   Ying Han   memcg: reclaim me...
1356
1357
  	return node;
  }
889976dbc   Ying Han   memcg: reclaim me...
1358
  #else
c0ff4b854   Raghavendra K T   memcg: rename mem...
1359
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1360
1361
1362
1363
  {
  	return 0;
  }
  #endif
0608f43da   Andrew Morton   revert "memcg, vm...
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
  static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
  				   struct zone *zone,
  				   gfp_t gfp_mask,
  				   unsigned long *total_scanned)
  {
  	struct mem_cgroup *victim = NULL;
  	int total = 0;
  	int loop = 0;
  	unsigned long excess;
  	unsigned long nr_scanned;
  	struct mem_cgroup_reclaim_cookie reclaim = {
  		.zone = zone,
  		.priority = 0,
  	};
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1378
  	excess = soft_limit_excess(root_memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
  
  	while (1) {
  		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
  		if (!victim) {
  			loop++;
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
  				 * anything, it might be because there are
  				 * no reclaimable pages under this hierarchy
  				 */
  				if (!total)
  					break;
  				/*
  				 * We want to do more targeted reclaim.
  				 * excess >> 2 is not too excessive, so we do not
  				 * reclaim too much, nor too little, which would
  				 * keep us coming back to reclaim from this cgroup.
  				 */
  				if (total >= (excess >> 2) ||
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
  					break;
  			}
  			continue;
  		}
0608f43da   Andrew Morton   revert "memcg, vm...
1404
1405
1406
  		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
  						     zone, &nr_scanned);
  		*total_scanned += nr_scanned;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1407
  		if (!soft_limit_excess(root_memcg))
0608f43da   Andrew Morton   revert "memcg, vm...
1408
  			break;
6d61ef409   Balbir Singh   memcg: memory cgr...
1409
  	}
0608f43da   Andrew Morton   revert "memcg, vm...
1410
1411
  	mem_cgroup_iter_break(root_memcg, victim);
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1412
  }
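  /*
   * Illustrative note (hypothetical numbers): with soft_limit_excess()
   * returning 400 pages, the loop above keeps picking victims until the
   * excess is gone, until at least excess >> 2 = 100 pages have been
   * reclaimed, or until repeated full passes over the hierarchy reclaim
   * nothing or exceed MEM_CGROUP_MAX_RECLAIM_LOOPS.
   */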
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1413
1414
1415
1416
1417
  #ifdef CONFIG_LOCKDEP
  static struct lockdep_map memcg_oom_lock_dep_map = {
  	.name = "memcg_oom_lock",
  };
  #endif
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1418
  static DEFINE_SPINLOCK(memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1419
1420
1421
1422
  /*
   * Check OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
   */
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1423
  static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1424
  {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1425
  	struct mem_cgroup *iter, *failed = NULL;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1426

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1427
  	spin_lock(&memcg_oom_lock);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1428
  	for_each_mem_cgroup_tree(iter, memcg) {
23751be00   Johannes Weiner   memcg: fix hierar...
1429
  		if (iter->oom_lock) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1430
1431
1432
1433
  			/*
  			 * This subtree of our hierarchy is already locked,
  			 * so we cannot grant the lock.
  			 */
79dfdaccd   Michal Hocko   memcg: make oom_l...
1434
  			failed = iter;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1435
1436
  			mem_cgroup_iter_break(memcg, iter);
  			break;
23751be00   Johannes Weiner   memcg: fix hierar...
1437
1438
  		} else
  			iter->oom_lock = true;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1439
  	}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1440

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
  	if (failed) {
  		/*
  		 * OK, we failed to lock the whole subtree so we have
  		 * to clean up what we set up to the failing subtree
  		 */
  		for_each_mem_cgroup_tree(iter, memcg) {
  			if (iter == failed) {
  				mem_cgroup_iter_break(memcg, iter);
  				break;
  			}
  			iter->oom_lock = false;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1452
  		}
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1453
1454
  	} else
  		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1455
1456
1457
1458
  
  	spin_unlock(&memcg_oom_lock);
  
  	return !failed;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1459
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1460

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1461
  static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1462
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1463
  	struct mem_cgroup *iter;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1464
  	spin_lock(&memcg_oom_lock);
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1465
  	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1466
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1467
  		iter->oom_lock = false;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1468
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1469
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1470
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1471
1472
  {
  	struct mem_cgroup *iter;
c2b42d3ca   Tejun Heo   memcg: convert me...
1473
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1474
  	for_each_mem_cgroup_tree(iter, memcg)
c2b42d3ca   Tejun Heo   memcg: convert me...
1475
1476
  		iter->under_oom++;
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1477
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1478
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1479
1480
  {
  	struct mem_cgroup *iter;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1481
1482
  	/*
  	 * When a new child is created while the hierarchy is under oom,
c2b42d3ca   Tejun Heo   memcg: convert me...
1483
  	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1484
  	 */
c2b42d3ca   Tejun Heo   memcg: convert me...
1485
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1486
  	for_each_mem_cgroup_tree(iter, memcg)
c2b42d3ca   Tejun Heo   memcg: convert me...
1487
1488
1489
  		if (iter->under_oom > 0)
  			iter->under_oom--;
  	spin_unlock(&memcg_oom_lock);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1490
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1491
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1492
  struct oom_wait_info {
d79154bb5   Hugh Dickins   memcg: replace me...
1493
  	struct mem_cgroup *memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1494
1495
1496
1497
1498
1499
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
1500
1501
  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
  	struct mem_cgroup *oom_wait_memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1502
1503
1504
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
d79154bb5   Hugh Dickins   memcg: replace me...
1505
  	oom_wait_memcg = oom_wait_info->memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1506

2314b42db   Johannes Weiner   mm: memcontrol: d...
1507
1508
  	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
  	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1509
  		return 0;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1510
1511
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1512
  static void memcg_oom_recover(struct mem_cgroup *memcg)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1513
  {
c2b42d3ca   Tejun Heo   memcg: convert me...
1514
1515
1516
1517
1518
1519
1520
1521
1522
  	/*
  	 * For the following lockless ->under_oom test, the only required
  	 * guarantee is that it must see the state asserted by an OOM when
  	 * this function is called as a result of userland actions
  	 * triggered by the notification of the OOM.  This is trivially
  	 * achieved by invoking mem_cgroup_mark_under_oom() before
  	 * triggering notification.
  	 */
  	if (memcg && memcg->under_oom)
f4b90b70b   Tejun Heo   memcg: remove unu...
1523
  		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1524
  }
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1525
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1526
  {
1383399d7   Vladimir Davydov   mm: memcontrol: f...
1527
  	if (!current->memcg_may_oom || current->memcg_in_oom)
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1528
  		return;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1529
  	/*
494264208   Johannes Weiner   mm: memcg: handle...
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
  	 * We are in the middle of the charge context here, so we
  	 * don't want to block when potentially sitting on a callstack
  	 * that holds all kinds of filesystem and mm locks.
  	 *
  	 * Also, the caller may handle a failed allocation gracefully
  	 * (like optional page cache readahead) and so an OOM killer
  	 * invocation might not even be necessary.
  	 *
  	 * That's why we don't do anything here except remember the
  	 * OOM context and then deal with it at the end of the page
  	 * fault when the stack is unwound, the locks are released,
  	 * and when we know whether the fault was overall successful.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1542
  	 */
494264208   Johannes Weiner   mm: memcg: handle...
1543
  	css_get(&memcg->css);
626ebc410   Tejun Heo   memcg: flatten ta...
1544
1545
1546
  	current->memcg_in_oom = memcg;
  	current->memcg_oom_gfp_mask = mask;
  	current->memcg_oom_order = order;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1547
1548
1549
1550
  }
  
  /**
   * mem_cgroup_oom_synchronize - complete memcg OOM handling
494264208   Johannes Weiner   mm: memcg: handle...
1551
   * @handle: actually kill/wait or just clean up the OOM state
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1552
   *
494264208   Johannes Weiner   mm: memcg: handle...
1553
1554
   * This has to be called at the end of a page fault if the memcg OOM
   * handler was enabled.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1555
   *
494264208   Johannes Weiner   mm: memcg: handle...
1556
   * Memcg supports userspace OOM handling where failed allocations must
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1557
1558
1559
1560
   * sleep on a waitqueue until the userspace task resolves the
   * situation.  Sleeping directly in the charge context with all kinds
   * of locks held is not a good idea, instead we remember an OOM state
   * in the task and mem_cgroup_oom_synchronize() has to be called at
494264208   Johannes Weiner   mm: memcg: handle...
1561
   * the end of the page fault to complete the OOM handling.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1562
1563
   *
   * Returns %true if an ongoing memcg OOM situation was detected and
494264208   Johannes Weiner   mm: memcg: handle...
1564
   * completed, %false otherwise.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1565
   */
494264208   Johannes Weiner   mm: memcg: handle...
1566
  bool mem_cgroup_oom_synchronize(bool handle)
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1567
  {
626ebc410   Tejun Heo   memcg: flatten ta...
1568
  	struct mem_cgroup *memcg = current->memcg_in_oom;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1569
  	struct oom_wait_info owait;
494264208   Johannes Weiner   mm: memcg: handle...
1570
  	bool locked;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1571
1572
  
  	/* OOM is global, do not handle */
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1573
  	if (!memcg)
494264208   Johannes Weiner   mm: memcg: handle...
1574
  		return false;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1575

c32b3cbe0   Michal Hocko   oom, PM: make OOM...
1576
  	if (!handle || oom_killer_disabled)
494264208   Johannes Weiner   mm: memcg: handle...
1577
  		goto cleanup;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1578
1579
1580
1581
1582
1583
  
  	owait.memcg = memcg;
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1584

3812c8c8f   Johannes Weiner   mm: memcg: do not...
1585
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
494264208   Johannes Weiner   mm: memcg: handle...
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
  	mem_cgroup_mark_under_oom(memcg);
  
  	locked = mem_cgroup_oom_trylock(memcg);
  
  	if (locked)
  		mem_cgroup_oom_notify(memcg);
  
  	if (locked && !memcg->oom_kill_disable) {
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
626ebc410   Tejun Heo   memcg: flatten ta...
1596
1597
  		mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
  					 current->memcg_oom_order);
494264208   Johannes Weiner   mm: memcg: handle...
1598
  	} else {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1599
  		schedule();
494264208   Johannes Weiner   mm: memcg: handle...
1600
1601
1602
1603
1604
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  	}
  
  	if (locked) {
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1605
1606
1607
1608
1609
1610
1611
1612
  		mem_cgroup_oom_unlock(memcg);
  		/*
  		 * There is no guarantee that an OOM-lock contender
  		 * sees the wakeups triggered by the OOM kill
  		 * uncharges.  Wake any sleepers explicitly.
  		 */
  		memcg_oom_recover(memcg);
  	}
494264208   Johannes Weiner   mm: memcg: handle...
1613
  cleanup:
626ebc410   Tejun Heo   memcg: flatten ta...
1614
  	current->memcg_in_oom = NULL;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1615
  	css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1616
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1617
  }
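  /*
   * Illustrative flow (summarizing the code above): a charge that fails in
   * page-fault context records the memcg via mem_cgroup_oom() and unwinds;
   * the fault handler later calls mem_cgroup_oom_synchronize(true), which
   * either invokes mem_cgroup_out_of_memory() (when it holds the OOM lock
   * and oom_kill_disable is clear) or sleeps on memcg_oom_waitq until the
   * OOM situation is resolved, e.g. by a userspace OOM handler.
   */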
d7365e783   Johannes Weiner   mm: memcontrol: f...
1618
  /**
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1619
1620
   * lock_page_memcg - lock a page->mem_cgroup binding
   * @page: the page
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1621
   *
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1622
1623
   * This function protects unlocked LRU pages from being moved to
   * another cgroup and stabilizes their page->mem_cgroup binding.
d69b042f3   Balbir Singh   memcg: add file-b...
1624
   */
62cccb8c8   Johannes Weiner   mm: simplify lock...
1625
  void lock_page_memcg(struct page *page)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1626
1627
  {
  	struct mem_cgroup *memcg;
6de226191   Johannes Weiner   mm: memcontrol: t...
1628
  	unsigned long flags;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1629

6de226191   Johannes Weiner   mm: memcontrol: t...
1630
1631
1632
1633
  	/*
  	 * The RCU lock is held throughout the transaction.  The fast
  	 * path can get away without acquiring the memcg->move_lock
  	 * because page moving starts with an RCU grace period.
6de226191   Johannes Weiner   mm: memcontrol: t...
1634
  	 */
d7365e783   Johannes Weiner   mm: memcontrol: f...
1635
1636
1637
  	rcu_read_lock();
  
  	if (mem_cgroup_disabled())
62cccb8c8   Johannes Weiner   mm: simplify lock...
1638
  		return;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1639
  again:
1306a85ae   Johannes Weiner   mm: embed the mem...
1640
  	memcg = page->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
1641
  	if (unlikely(!memcg))
62cccb8c8   Johannes Weiner   mm: simplify lock...
1642
  		return;
d7365e783   Johannes Weiner   mm: memcontrol: f...
1643

bdcbb659f   Qiang Huang   memcg: fold mem_c...
1644
  	if (atomic_read(&memcg->moving_account) <= 0)
62cccb8c8   Johannes Weiner   mm: simplify lock...
1645
  		return;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1646

6de226191   Johannes Weiner   mm: memcontrol: t...
1647
  	spin_lock_irqsave(&memcg->move_lock, flags);
1306a85ae   Johannes Weiner   mm: embed the mem...
1648
  	if (memcg != page->mem_cgroup) {
6de226191   Johannes Weiner   mm: memcontrol: t...
1649
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1650
1651
  		goto again;
  	}
6de226191   Johannes Weiner   mm: memcontrol: t...
1652
1653
1654
1655
  
  	/*
  	 * When charge migration first begins, we can have locked and
  	 * unlocked page stat updates happening concurrently.  Track
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1656
  	 * the task who has the lock for unlock_page_memcg().
6de226191   Johannes Weiner   mm: memcontrol: t...
1657
1658
1659
  	 */
  	memcg->move_lock_task = current;
  	memcg->move_lock_flags = flags;
d7365e783   Johannes Weiner   mm: memcontrol: f...
1660

62cccb8c8   Johannes Weiner   mm: simplify lock...
1661
  	return;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1662
  }
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1663
  EXPORT_SYMBOL(lock_page_memcg);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1664

d7365e783   Johannes Weiner   mm: memcontrol: f...
1665
  /**
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1666
   * unlock_page_memcg - unlock a page->mem_cgroup binding
62cccb8c8   Johannes Weiner   mm: simplify lock...
1667
   * @page: the page
d7365e783   Johannes Weiner   mm: memcontrol: f...
1668
   */
62cccb8c8   Johannes Weiner   mm: simplify lock...
1669
  void unlock_page_memcg(struct page *page)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1670
  {
62cccb8c8   Johannes Weiner   mm: simplify lock...
1671
  	struct mem_cgroup *memcg = page->mem_cgroup;
6de226191   Johannes Weiner   mm: memcontrol: t...
1672
1673
1674
1675
1676
1677
1678
1679
  	if (memcg && memcg->move_lock_task == current) {
  		unsigned long flags = memcg->move_lock_flags;
  
  		memcg->move_lock_task = NULL;
  		memcg->move_lock_flags = 0;
  
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
  	}
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1680

d7365e783   Johannes Weiner   mm: memcontrol: f...
1681
  	rcu_read_unlock();
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1682
  }
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
1683
  EXPORT_SYMBOL(unlock_page_memcg);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1684

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1685
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1686
1687
1688
   * size of first charge trial. "32" comes from vmscan.c's magic value.
   * TODO: it may be necessary to use bigger numbers on big iron.
   */
7ec99d621   Johannes Weiner   memcg: unify char...
1689
  #define CHARGE_BATCH	32U
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1690
1691
  struct memcg_stock_pcp {
  	struct mem_cgroup *cached; /* this is never the root cgroup */
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1692
  	unsigned int nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1693
  	struct work_struct work;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1694
  	unsigned long flags;
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
1695
  #define FLUSHING_CACHED_CHARGE	0
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1696
1697
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
9f50fad65   Michal Hocko   Revert "memcg: ge...
1698
  static DEFINE_MUTEX(percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1699

a0956d544   Suleiman Souhlal   memcg: make it po...
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
  /**
   * consume_stock: Try to consume stocked charge on this cpu.
   * @memcg: memcg to consume from.
   * @nr_pages: how many pages to charge.
   *
   * The charges will only happen if @memcg matches the current cpu's memcg
   * stock, and at least @nr_pages are available in that stock.  Failure to
   * service an allocation will refill the stock.
   *
   * returns true if successful, false otherwise.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1710
   */
a0956d544   Suleiman Souhlal   memcg: make it po...
1711
  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1712
1713
  {
  	struct memcg_stock_pcp *stock;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1714
  	bool ret = false;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1715

a0956d544   Suleiman Souhlal   memcg: make it po...
1716
  	if (nr_pages > CHARGE_BATCH)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1717
  		return ret;
a0956d544   Suleiman Souhlal   memcg: make it po...
1718

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1719
  	stock = &get_cpu_var(memcg_stock);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1720
  	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
a0956d544   Suleiman Souhlal   memcg: make it po...
1721
  		stock->nr_pages -= nr_pages;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1722
1723
  		ret = true;
  	}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1724
1725
1726
1727
1728
  	put_cpu_var(memcg_stock);
  	return ret;
  }
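  /*
   * Illustrative example (hypothetical numbers): with 32 pages stocked for
   * the same memcg on this cpu, a 4-page charge is served locally, leaving
   * 28 pages stocked and touching no page_counter; a 64-page request
   * exceeds CHARGE_BATCH and always falls through to the counters.
   */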
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1729
   * Returns the stocks cached in percpu and resets the cached information.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1730
1731
1732
1733
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1734
  	if (stock->nr_pages) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1735
  		page_counter_uncharge(&old->memory, stock->nr_pages);
7941d2145   Johannes Weiner   mm: memcontrol: d...
1736
  		if (do_memsw_account())
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1737
  			page_counter_uncharge(&old->memsw, stock->nr_pages);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
1738
  		css_put_many(&old->css, stock->nr_pages);
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1739
  		stock->nr_pages = 0;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1740
1741
  	}
  	stock->cached = NULL;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1742
1743
1744
1745
1746
1747
1748
1749
  }
  
  /*
   * This must be called under preempt disabled or must be called by
   * a thread which is pinned to local cpu.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
7c8e0181e   Christoph Lameter   mm: replace __get...
1750
  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1751
  	drain_stock(stock);
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1752
  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1753
1754
1755
  }
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1756
   * Cache charges (nr_pages) in the local per-cpu area.
320cc51d9   Greg Thelen   mm: fix typo in r...
1757
   * This will be consumed by consume_stock() function, later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1758
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1759
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1760
1761
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1762
  	if (stock->cached != memcg) { /* reset if necessary */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1763
  		drain_stock(stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1764
  		stock->cached = memcg;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1765
  	}
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1766
  	stock->nr_pages += nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1767
1768
1769
1770
  	put_cpu_var(memcg_stock);
  }
  
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
1771
   * Drains all per-CPU charge caches for the given root_memcg and the subtree
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
1772
   * of the hierarchy under it.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1773
   */
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
1774
  static void drain_all_stock(struct mem_cgroup *root_memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1775
  {
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1776
  	int cpu, curcpu;
d38144b7a   Michal Hocko   memcg: unify sync...
1777

6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
1778
1779
1780
  	/* If someone's already draining, avoid running more workers. */
  	if (!mutex_trylock(&percpu_charge_mutex))
  		return;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1781
  	/* Notify other cpus that system-wide "drain" is running */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1782
  	get_online_cpus();
5af12d0ef   Johannes Weiner   memcg: pin execut...
1783
  	curcpu = get_cpu();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1784
1785
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1786
  		struct mem_cgroup *memcg;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1787

c0ff4b854   Raghavendra K T   memcg: rename mem...
1788
1789
  		memcg = stock->cached;
  		if (!memcg || !stock->nr_pages)
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1790
  			continue;
2314b42db   Johannes Weiner   mm: memcontrol: d...
1791
  		if (!mem_cgroup_is_descendant(memcg, root_memcg))
3e92041d6   Michal Hocko   memcg: add mem_cg...
1792
  			continue;
d1a05b697   Michal Hocko   memcg: do not try...
1793
1794
1795
1796
1797
1798
  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
  			if (cpu == curcpu)
  				drain_local_stock(&stock->work);
  			else
  				schedule_work_on(cpu, &stock->work);
  		}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1799
  	}
5af12d0ef   Johannes Weiner   memcg: pin execut...
1800
  	put_cpu();
f894ffa86   Andrew Morton   memcg: trivial cl...
1801
  	put_online_cpus();
9f50fad65   Michal Hocko   Revert "memcg: ge...
1802
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1803
  }
0db0628d9   Paul Gortmaker   kernel: delete __...
1804
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1805
1806
1807
1808
1809
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1810
  	if (action == CPU_ONLINE)
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1811
  		return NOTIFY_OK;
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1812

d833049bd   Kirill A. Shutemov   memcg: fix broken...
1813
  	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1814
  		return NOTIFY_OK;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1815

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1816
1817
1818
1819
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
  static void reclaim_high(struct mem_cgroup *memcg,
  			 unsigned int nr_pages,
  			 gfp_t gfp_mask)
  {
  	do {
  		if (page_counter_read(&memcg->memory) <= memcg->high)
  			continue;
  		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
  		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
  	} while ((memcg = parent_mem_cgroup(memcg)));
  }
  
  static void high_work_func(struct work_struct *work)
  {
  	struct mem_cgroup *memcg;
  
  	memcg = container_of(work, struct mem_cgroup, high_work);
  	reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
  }
b23afb93d   Tejun Heo   memcg: punt high ...
1839
1840
1841
1842
1843
1844
1845
  /*
   * Scheduled by try_charge() to be executed from the userland return path
   * and reclaims memory over the high limit.
   */
  void mem_cgroup_handle_over_high(void)
  {
  	unsigned int nr_pages = current->memcg_nr_pages_over_high;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
1846
  	struct mem_cgroup *memcg;
b23afb93d   Tejun Heo   memcg: punt high ...
1847
1848
1849
  
  	if (likely(!nr_pages))
  		return;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
1850
1851
  	memcg = get_mem_cgroup_from_mm(current->mm);
  	reclaim_high(memcg, nr_pages, GFP_KERNEL);
b23afb93d   Tejun Heo   memcg: punt high ...
1852
1853
1854
  	css_put(&memcg->css);
  	current->memcg_nr_pages_over_high = 0;
  }
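  /*
   * Illustrative flow (summarizing the code above and in try_charge()): a
   * task that charges past memory.high has the batch size added to
   * current->memcg_nr_pages_over_high and gets a resume notification; on
   * its way back to userland this function runs reclaim_high() with
   * GFP_KERNEL, walking from its memcg up through any ancestors that are
   * still above their high limit.
   */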
00501b531   Johannes Weiner   mm: memcontrol: r...
1855
1856
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
  		      unsigned int nr_pages)
8a9f3ccd2   Balbir Singh   Memory controller...
1857
  {
7ec99d621   Johannes Weiner   memcg: unify char...
1858
  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
9b1306192   Johannes Weiner   mm: memcontrol: r...
1859
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1860
  	struct mem_cgroup *mem_over_limit;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1861
  	struct page_counter *counter;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1862
  	unsigned long nr_reclaimed;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
1863
1864
  	bool may_swap = true;
  	bool drained = false;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1865

ce00a9673   Johannes Weiner   mm: memcontrol: r...
1866
  	if (mem_cgroup_is_root(memcg))
10d53c748   Tejun Heo   memcg: ratify and...
1867
  		return 0;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1868
  retry:
b6b6cc72b   Michal Hocko   memcg: do not rep...
1869
  	if (consume_stock(memcg, nr_pages))
10d53c748   Tejun Heo   memcg: ratify and...
1870
  		return 0;
8a9f3ccd2   Balbir Singh   Memory controller...
1871

7941d2145   Johannes Weiner   mm: memcontrol: d...
1872
  	if (!do_memsw_account() ||
6071ca520   Johannes Weiner   mm: page_counter:...
1873
1874
  	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
  		if (page_counter_try_charge(&memcg->memory, batch, &counter))
6539cc053   Johannes Weiner   mm: memcontrol: f...
1875
  			goto done_restock;
7941d2145   Johannes Weiner   mm: memcontrol: d...
1876
  		if (do_memsw_account())
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1877
1878
  			page_counter_uncharge(&memcg->memsw, batch);
  		mem_over_limit = mem_cgroup_from_counter(counter, memory);
3fbe72442   Johannes Weiner   mm: memcontrol: s...
1879
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1880
  		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
1881
  		may_swap = false;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
1882
  	}
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1883

6539cc053   Johannes Weiner   mm: memcontrol: f...
1884
1885
1886
1887
  	if (batch > nr_pages) {
  		batch = nr_pages;
  		goto retry;
  	}
6d61ef409   Balbir Singh   memcg: memory cgr...
1888

06b078fc0   Johannes Weiner   mm: memcontrol: r...
1889
1890
1891
1892
1893
1894
1895
1896
1897
  	/*
  	 * Unlike in global OOM situations, memcg is not in a physical
  	 * memory shortage.  Allow dying and OOM-killed tasks to
  	 * bypass the last charges so that they can exit quickly and
  	 * free their memory.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
  		     fatal_signal_pending(current) ||
  		     current->flags & PF_EXITING))
10d53c748   Tejun Heo   memcg: ratify and...
1898
  		goto force;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
1899
1900
1901
  
  	if (unlikely(task_in_memcg_oom(current)))
  		goto nomem;
d0164adc8   Mel Gorman   mm, page_alloc: d...
1902
  	if (!gfpflags_allow_blocking(gfp_mask))
6539cc053   Johannes Weiner   mm: memcontrol: f...
1903
  		goto nomem;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1904

241994ed8   Johannes Weiner   mm: memcontrol: d...
1905
  	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
1906
1907
  	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
  						    gfp_mask, may_swap);
6539cc053   Johannes Weiner   mm: memcontrol: f...
1908

61e02c745   Johannes Weiner   mm: memcontrol: c...
1909
  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
6539cc053   Johannes Weiner   mm: memcontrol: f...
1910
  		goto retry;
28c34c291   Johannes Weiner   mm: memcontrol: r...
1911

b70a2a21d   Johannes Weiner   mm: memcontrol: f...
1912
  	if (!drained) {
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
1913
  		drain_all_stock(mem_over_limit);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
1914
1915
1916
  		drained = true;
  		goto retry;
  	}
28c34c291   Johannes Weiner   mm: memcontrol: r...
1917
1918
  	if (gfp_mask & __GFP_NORETRY)
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1919
1920
1921
1922
1923
1924
1925
1926
1927
  	/*
  	 * Even though the limit is exceeded at this point, reclaim
  	 * may have been able to free some pages.  Retry the charge
  	 * before killing the task.
  	 *
  	 * Only for regular pages, though: huge pages are rather
  	 * unlikely to succeed so close to the limit, and we fall back
  	 * to regular pages anyway in case of failure.
  	 */
61e02c745   Johannes Weiner   mm: memcontrol: c...
1928
  	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
6539cc053   Johannes Weiner   mm: memcontrol: f...
1929
1930
1931
1932
1933
1934
1935
  		goto retry;
  	/*
  	 * At task move, charge accounts can be doubly counted. So, it's
  	 * better to wait until the end of task_move if something is going on.
  	 */
  	if (mem_cgroup_wait_acct_move(mem_over_limit))
  		goto retry;
9b1306192   Johannes Weiner   mm: memcontrol: r...
1936
1937
  	if (nr_retries--)
  		goto retry;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
1938
  	if (gfp_mask & __GFP_NOFAIL)
10d53c748   Tejun Heo   memcg: ratify and...
1939
  		goto force;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
1940

6539cc053   Johannes Weiner   mm: memcontrol: f...
1941
  	if (fatal_signal_pending(current))
10d53c748   Tejun Heo   memcg: ratify and...
1942
  		goto force;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1943

241994ed8   Johannes Weiner   mm: memcontrol: d...
1944
  	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
3608de078   Jerome Marchand   mm/memcontrol.c: ...
1945
1946
  	mem_cgroup_oom(mem_over_limit, gfp_mask,
  		       get_order(nr_pages * PAGE_SIZE));
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1947
  nomem:
6d1fdc489   Johannes Weiner   memcg: sanitize _...
1948
  	if (!(gfp_mask & __GFP_NOFAIL))
3168ecbe1   Johannes Weiner   mm: memcg: use pr...
1949
  		return -ENOMEM;
10d53c748   Tejun Heo   memcg: ratify and...
1950
1951
1952
1953
1954
1955
1956
  force:
  	/*
  	 * The allocation either can't fail or will lead to more memory
  	 * being freed very soon.  Allow memory usage go over the limit
  	 * temporarily by force charging it.
  	 */
  	page_counter_charge(&memcg->memory, nr_pages);
7941d2145   Johannes Weiner   mm: memcontrol: d...
1957
  	if (do_memsw_account())
10d53c748   Tejun Heo   memcg: ratify and...
1958
1959
1960
1961
  		page_counter_charge(&memcg->memsw, nr_pages);
  	css_get_many(&memcg->css, nr_pages);
  
  	return 0;
6539cc053   Johannes Weiner   mm: memcontrol: f...
1962
1963
  
  done_restock:
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
1964
  	css_get_many(&memcg->css, batch);
6539cc053   Johannes Weiner   mm: memcontrol: f...
1965
1966
  	if (batch > nr_pages)
  		refill_stock(memcg, batch - nr_pages);
b23afb93d   Tejun Heo   memcg: punt high ...
1967

241994ed8   Johannes Weiner   mm: memcontrol: d...
1968
  	/*
b23afb93d   Tejun Heo   memcg: punt high ...
1969
1970
  	 * If the hierarchy is above the normal consumption range, schedule
  	 * reclaim on returning to userland.  We can perform reclaim here
71baba4b9   Mel Gorman   mm, page_alloc: r...
1971
  	 * if __GFP_RECLAIM is set, but let's always punt for simplicity and so that
b23afb93d   Tejun Heo   memcg: punt high ...
1972
1973
1974
1975
  	 * GFP_KERNEL can consistently be used during reclaim.  @memcg is
  	 * not recorded as it most likely matches current's and won't
  	 * change in the meantime.  As high limit is checked again before
  	 * reclaim, the cost of mismatch is negligible.
241994ed8   Johannes Weiner   mm: memcontrol: d...
1976
1977
  	 */
  	do {
b23afb93d   Tejun Heo   memcg: punt high ...
1978
  		if (page_counter_read(&memcg->memory) > memcg->high) {
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
1979
1980
1981
1982
1983
  			/* Don't bother a random interrupted task */
  			if (in_interrupt()) {
  				schedule_work(&memcg->high_work);
  				break;
  			}
9516a18a9   Vladimir Davydov   memcg: fix memory...
1984
  			current->memcg_nr_pages_over_high += batch;
b23afb93d   Tejun Heo   memcg: punt high ...
1985
1986
1987
  			set_notify_resume(current);
  			break;
  		}
241994ed8   Johannes Weiner   mm: memcontrol: d...
1988
  	} while ((memcg = parent_mem_cgroup(memcg)));
10d53c748   Tejun Heo   memcg: ratify and...
1989
1990
  
  	return 0;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
1991
  }
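  /*
   * Note on the done_restock path above: the excess recorded in
   * current->memcg_nr_pages_over_high is reclaimed later by
   * mem_cgroup_handle_over_high() on the return-to-userland path that
   * set_notify_resume() arms.  Interrupt context cannot take that path,
   * which is why it punts to memcg->high_work instead.
   */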
8a9f3ccd2   Balbir Singh   Memory controller...
1992

00501b531   Johannes Weiner   mm: memcontrol: r...
1993
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
1994
  {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
1995
1996
  	if (mem_cgroup_is_root(memcg))
  		return;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1997
  	page_counter_uncharge(&memcg->memory, nr_pages);
7941d2145   Johannes Weiner   mm: memcontrol: d...
1998
  	if (do_memsw_account())
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1999
  		page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2000

e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2001
  	css_put_many(&memcg->css, nr_pages);
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2002
  }
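  /*
   * cancel_charge() is the undo path for a try_charge() whose pages end
   * up not being committed to any page (e.g. the charge is aborted or a
   * racing task already charged the page): it returns the pages to the
   * counters and drops the css references taken by try_charge().
   */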
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
  static void lock_page_lru(struct page *page, int *isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	spin_lock_irq(&zone->lru_lock);
  	if (PageLRU(page)) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		ClearPageLRU(page);
  		del_page_from_lru_list(page, lruvec, page_lru(page));
  		*isolated = 1;
  	} else
  		*isolated = 0;
  }
  
  static void unlock_page_lru(struct page *page, int isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	if (isolated) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		SetPageLRU(page);
  		add_page_to_lru_list(page, lruvec, page_lru(page));
  	}
  	spin_unlock_irq(&zone->lru_lock);
  }
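  /*
   * lock_page_lru()/unlock_page_lru() support the lrucare case of
   * commit_charge() below: a page that may already sit on an LRU list is
   * isolated under zone->lru_lock before page->mem_cgroup is rewritten,
   * because the LRU code derives the lruvec from page->mem_cgroup (see
   * mem_cgroup_page_lruvec()).  Isolation guarantees nobody walks the
   * page on a stale lruvec while its owner changes.
   */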
00501b531   Johannes Weiner   mm: memcontrol: r...
2033
  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
6abb5a867   Johannes Weiner   mm: memcontrol: a...
2034
  			  bool lrucare)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2035
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2036
  	int isolated;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2037

1306a85ae   Johannes Weiner   mm: embed the mem...
2038
  	VM_BUG_ON_PAGE(page->mem_cgroup, page);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2039
2040
2041
2042
2043
  
  	/*
  	 * In some cases (e.g. SwapCache and FUSE's splice_buf->radixtree), the
  	 * page may already be on some other mem_cgroup's LRU.  Take care of it.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2044
2045
  	if (lrucare)
  		lock_page_lru(page, &isolated);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2046

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2047
2048
  	/*
  	 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
2049
  	 * page->mem_cgroup at this point:
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
  	 *
  	 * - the page is uncharged
  	 *
  	 * - the page is off-LRU
  	 *
  	 * - an anonymous fault has exclusive page access, except for
  	 *   a locked page table
  	 *
  	 * - a page cache insertion, a swapin fault, or a migration
  	 *   have the page locked
  	 */
1306a85ae   Johannes Weiner   mm: embed the mem...
2061
  	page->mem_cgroup = memcg;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2062

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2063
2064
  	if (lrucare)
  		unlock_page_lru(page, isolated);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2065
  }
66e1707bc   Balbir Singh   Memory controller...
2066

127424c86   Johannes Weiner   mm: memcontrol: m...
2067
  #ifndef CONFIG_SLOB
f3bb3043a   Vladimir Davydov   memcg: don't call...
2068
  static int memcg_alloc_cache_id(void)
55007d849   Glauber Costa   memcg: allocate m...
2069
  {
f3bb3043a   Vladimir Davydov   memcg: don't call...
2070
2071
  	int id, size;
  	int err;
dbcf73e26   Vladimir Davydov   memcg: rename som...
2072
  	id = ida_simple_get(&memcg_cache_ida,
f3bb3043a   Vladimir Davydov   memcg: don't call...
2073
2074
2075
  			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
  	if (id < 0)
  		return id;
55007d849   Glauber Costa   memcg: allocate m...
2076

dbcf73e26   Vladimir Davydov   memcg: rename som...
2077
  	if (id < memcg_nr_cache_ids)
f3bb3043a   Vladimir Davydov   memcg: don't call...
2078
2079
2080
2081
2082
2083
  		return id;
  
  	/*
  	 * There's no space for the new id in memcg_caches arrays,
  	 * so we have to grow them.
  	 */
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2084
  	down_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2085
2086
  
  	size = 2 * (id + 1);
55007d849   Glauber Costa   memcg: allocate m...
2087
2088
2089
2090
  	if (size < MEMCG_CACHES_MIN_SIZE)
  		size = MEMCG_CACHES_MIN_SIZE;
  	else if (size > MEMCG_CACHES_MAX_SIZE)
  		size = MEMCG_CACHES_MAX_SIZE;
f3bb3043a   Vladimir Davydov   memcg: don't call...
2091
  	err = memcg_update_all_caches(size);
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2092
  	if (!err)
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2093
2094
  		err = memcg_update_all_list_lrus(size);
  	if (!err)
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2095
2096
2097
  		memcg_nr_cache_ids = size;
  
  	up_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2098
  	if (err) {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2099
  		ida_simple_remove(&memcg_cache_ida, id);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2100
2101
2102
2103
2104
2105
2106
  		return err;
  	}
  	return id;
  }
  
  static void memcg_free_cache_id(int id)
  {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2107
  	ida_simple_remove(&memcg_cache_ida, id);
55007d849   Glauber Costa   memcg: allocate m...
2108
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2109
  struct memcg_kmem_cache_create_work {
5722d094a   Vladimir Davydov   memcg, slab: clea...
2110
2111
2112
2113
  	struct mem_cgroup *memcg;
  	struct kmem_cache *cachep;
  	struct work_struct work;
  };
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2114
  static void memcg_kmem_cache_create_func(struct work_struct *w)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2115
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2116
2117
  	struct memcg_kmem_cache_create_work *cw =
  		container_of(w, struct memcg_kmem_cache_create_work, work);
5722d094a   Vladimir Davydov   memcg, slab: clea...
2118
2119
  	struct mem_cgroup *memcg = cw->memcg;
  	struct kmem_cache *cachep = cw->cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2120

d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2121
  	memcg_create_kmem_cache(memcg, cachep);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2122

5722d094a   Vladimir Davydov   memcg, slab: clea...
2123
  	css_put(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2124
2125
2126
2127
2128
  	kfree(cw);
  }
  
  /*
   * Enqueue the creation of a per-memcg kmem_cache.
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2129
   */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2130
2131
  static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					       struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2132
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2133
  	struct memcg_kmem_cache_create_work *cw;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2134

776ed0f03   Vladimir Davydov   memcg: cleanup km...
2135
  	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2136
  	if (!cw)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2137
  		return;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2138
2139
  
  	css_get(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2140
2141
2142
  
  	cw->memcg = memcg;
  	cw->cachep = cachep;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2143
  	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2144

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2145
2146
  	schedule_work(&cw->work);
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2147
2148
  static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					     struct kmem_cache *cachep)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2149
2150
2151
2152
  {
  	/*
  	 * We need to stop accounting when we kmalloc, because if the
  	 * corresponding kmalloc cache is not yet created, the first allocation
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2153
  	 * in __memcg_schedule_kmem_cache_create will recurse.
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2154
2155
2156
2157
2158
2159
2160
  	 *
  	 * However, it is better to enclose the whole function. Depending on
  	 * the debugging options enabled, INIT_WORK(), for instance, can
  	 * trigger an allocation. This, too, will make us recurse. Because at
  	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
  	 * the safest choice is to do it like this, wrapping the whole function.
  	 */
6f185c290   Vladimir Davydov   memcg: turn memcg...
2161
  	current->memcg_kmem_skip_account = 1;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2162
  	__memcg_schedule_kmem_cache_create(memcg, cachep);
6f185c290   Vladimir Davydov   memcg: turn memcg...
2163
  	current->memcg_kmem_skip_account = 0;
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2164
  }
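  /*
   * Illustrative recursion that memcg_kmem_skip_account prevents above
   * (a sketch of the call chain, not an actual backtrace):
   *
   *   memcg_kmem_get_cache(cachep)
   *     memcg_schedule_kmem_cache_create()
   *       kmalloc(sizeof(*cw), GFP_NOWAIT)       <- slab allocation
   *         memcg_kmem_get_cache(...)            <- same missing cache
   *           memcg_schedule_kmem_cache_create() <- and so on
   *
   * With the flag set, the nested __memcg_kmem_get_cache() call bails
   * out early and the allocation is served from the root cache.
   */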
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2165

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
  /*
   * Return the kmem_cache we're supposed to use for a slab allocation.
   * We try to use the current memcg's version of the cache.
   *
   * If the cache does not exist yet and we are the first user of it,
   * we either create it immediately, if possible, or create it asynchronously
   * in a workqueue.
   * In the latter case, we will let the current allocation go through with
   * the original cache.
   *
   * Can't be called in interrupt context or from kernel threads.
   * This function needs to be called with rcu_read_lock() held.
   */
230e9fc28   Vladimir Davydov   slab: add SLAB_AC...
2179
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2180
2181
  {
  	struct mem_cgroup *memcg;
959c8963f   Vladimir Davydov   memcg, slab: fix ...
2182
  	struct kmem_cache *memcg_cachep;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2183
  	int kmemcg_id;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2184

f7ce3190c   Vladimir Davydov   slab: embed memcg...
2185
  	VM_BUG_ON(!is_root_cache(cachep));
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2186

230e9fc28   Vladimir Davydov   slab: add SLAB_AC...
2187
2188
2189
2190
2191
  	if (cachep->flags & SLAB_ACCOUNT)
  		gfp |= __GFP_ACCOUNT;
  
  	if (!(gfp & __GFP_ACCOUNT))
  		return cachep;
9d100c5e4   Vladimir Davydov   memcg: don't chec...
2192
  	if (current->memcg_kmem_skip_account)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2193
  		return cachep;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2194
  	memcg = get_mem_cgroup_from_mm(current->mm);
4db0c3c29   Jason Low   mm: remove rest o...
2195
  	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2196
  	if (kmemcg_id < 0)
ca0dde971   Li Zefan   memcg: take refer...
2197
  		goto out;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2198

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2199
  	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2200
2201
  	if (likely(memcg_cachep))
  		return memcg_cachep;
ca0dde971   Li Zefan   memcg: take refer...
2202
2203
2204
2205
2206
2207
2208
2209
2210
  
  	/*
  	 * If we are in a safe context (can wait, and not in interrupt
  	 * context), we could be predictable and return right away.
  	 * This would guarantee that the allocation being performed
  	 * already belongs in the new cache.
  	 *
  	 * However, there are some clashes that can arise from locking.
  	 * For instance, because we acquire the slab_mutex while doing
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2211
2212
2213
  	 * memcg_create_kmem_cache, this means no further allocation
  	 * could happen with the slab_mutex held. So it's better to
  	 * defer everything.
ca0dde971   Li Zefan   memcg: take refer...
2214
  	 */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2215
  	memcg_schedule_kmem_cache_create(memcg, cachep);
ca0dde971   Li Zefan   memcg: take refer...
2216
  out:
8135be5a8   Vladimir Davydov   memcg: fix possib...
2217
  	css_put(&memcg->css);
ca0dde971   Li Zefan   memcg: take refer...
2218
  	return cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2219
  }
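  /*
   * Typical pairing, as a sketch of how the slab allocators use the
   * memcg_kmem_get_cache()/memcg_kmem_put_cache() wrappers:
   *
   *   s = memcg_kmem_get_cache(cachep, flags);
   *   ... allocate the object from s ...
   *   memcg_kmem_put_cache(s);
   *
   * The put drops the reference on the memcg that owns the per-memcg
   * cache (see __memcg_kmem_put_cache() below).
   */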
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2220

8135be5a8   Vladimir Davydov   memcg: fix possib...
2221
2222
2223
  void __memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
  	if (!is_root_cache(cachep))
f7ce3190c   Vladimir Davydov   slab: embed memcg...
2224
  		css_put(&cachep->memcg_params.memcg->css);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2225
  }
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2226
2227
  int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
  			      struct mem_cgroup *memcg)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2228
  {
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2229
2230
  	unsigned int nr_pages = 1 << order;
  	struct page_counter *counter;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2231
  	int ret;
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2232
  	ret = try_charge(memcg, gfp, nr_pages);
52c29b048   Johannes Weiner   mm: memcontrol: a...
2233
  	if (ret)
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2234
  		return ret;
52c29b048   Johannes Weiner   mm: memcontrol: a...
2235
2236
2237
2238
2239
  
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
  	    !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
  		cancel_charge(memcg, nr_pages);
  		return -ENOMEM;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2240
  	}
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2241
  	page->mem_cgroup = memcg;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2242

f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2243
  	return 0;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2244
  }
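  /*
   * Note that on the default (cgroup v2) hierarchy only memcg->memory
   * (and memsw) is charged above; the dedicated kmem counter and its
   * limit are a legacy (v1) construct, which is why the
   * page_counter_try_charge() of memcg->kmem is skipped when
   * cgroup_subsys_on_dfl() is true.
   */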
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2245
  int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2246
  {
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2247
  	struct mem_cgroup *memcg;
fcff7d7ee   Vladimir Davydov   mm: memcontrol: d...
2248
  	int ret = 0;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2249

f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2250
  	memcg = get_mem_cgroup_from_mm(current->mm);
b6ecd2dea   Vladimir Davydov   mm: memcontrol: z...
2251
  	if (!mem_cgroup_is_root(memcg))
fcff7d7ee   Vladimir Davydov   mm: memcontrol: d...
2252
  		ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2253
  	css_put(&memcg->css);
d05e83a6f   Vladimir Davydov   memcg: simplify c...
2254
  	return ret;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2255
  }
d05e83a6f   Vladimir Davydov   memcg: simplify c...
2256
  void __memcg_kmem_uncharge(struct page *page, int order)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2257
  {
1306a85ae   Johannes Weiner   mm: embed the mem...
2258
  	struct mem_cgroup *memcg = page->mem_cgroup;
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2259
  	unsigned int nr_pages = 1 << order;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2260

7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2261
2262
  	if (!memcg)
  		return;
309381fea   Sasha Levin   mm: dump page whe...
2263
  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
298333157   Johannes Weiner   mm: memcontrol: r...
2264

52c29b048   Johannes Weiner   mm: memcontrol: a...
2265
2266
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
  		page_counter_uncharge(&memcg->kmem, nr_pages);
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2267
  	page_counter_uncharge(&memcg->memory, nr_pages);
7941d2145   Johannes Weiner   mm: memcontrol: d...
2268
  	if (do_memsw_account())
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2269
  		page_counter_uncharge(&memcg->memsw, nr_pages);
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2270

1306a85ae   Johannes Weiner   mm: embed the mem...
2271
  	page->mem_cgroup = NULL;
f3ccb2c42   Vladimir Davydov   memcg: unify slab...
2272
  	css_put_many(&memcg->css, nr_pages);
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2273
  }
127424c86   Johannes Weiner   mm: memcontrol: m...
2274
  #endif /* !CONFIG_SLOB */
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2275

ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2276
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2277
2278
  /*
   * Because tail pages are not marked as "used", set it. We're under
3ac808fdd   Kirill A. Shutemov   mm, thp: remove c...
2279
   * zone->lru_lock and migration entries are set up in all page mappings.
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2280
   */
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2281
  void mem_cgroup_split_huge_fixup(struct page *head)
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2282
  {
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2283
  	int i;
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2284

3d37c4a91   KAMEZAWA Hiroyuki   memcg: bugfix che...
2285
2286
  	if (mem_cgroup_disabled())
  		return;
b070e65c0   David Rientjes   mm, memcg: add rs...
2287

298333157   Johannes Weiner   mm: memcontrol: r...
2288
  	for (i = 1; i < HPAGE_PMD_NR; i++)
1306a85ae   Johannes Weiner   mm: embed the mem...
2289
  		head[i].mem_cgroup = head->mem_cgroup;
b9982f8d2   Michal Hocko   mm: memcontrol: m...
2290

1306a85ae   Johannes Weiner   mm: embed the mem...
2291
  	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
b070e65c0   David Rientjes   mm, memcg: add rs...
2292
  		       HPAGE_PMD_NR);
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2293
  }
12d271078   Hugh Dickins   memcg: fix split_...
2294
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2295

c255a4580   Andrew Morton   memcg: rename con...
2296
  #ifdef CONFIG_MEMCG_SWAP
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2297
2298
  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
  					 bool charge)
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2299
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2300
2301
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2302
  }
024914477   Daisuke Nishimura   memcg: move charg...
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2315
   * The caller must have charged to @to, IOW, called page_counter_charge() for
024914477   Daisuke Nishimura   memcg: move charg...
2316
2317
2318
   * both memory and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2319
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2320
2321
  {
  	unsigned short old_id, new_id;
34c00c319   Li Zefan   memcg: convert to...
2322
2323
  	old_id = mem_cgroup_id(from);
  	new_id = mem_cgroup_id(to);
024914477   Daisuke Nishimura   memcg: move charg...
2324
2325
  
  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
024914477   Daisuke Nishimura   memcg: move charg...
2326
  		mem_cgroup_swap_statistics(from, false);
483c30b51   Daisuke Nishimura   memcg: improve pe...
2327
  		mem_cgroup_swap_statistics(to, true);
024914477   Daisuke Nishimura   memcg: move charg...
2328
2329
2330
2331
2332
2333
  		return 0;
  	}
  	return -EINVAL;
  }
  #else
  static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2334
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2335
2336
2337
  {
  	return -EINVAL;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2338
  #endif
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2339

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2340
  static DEFINE_MUTEX(memcg_limit_mutex);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
2341

d38d2a758   KOSAKI Motohiro   mm: make mem_cgro...
2342
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2343
  				   unsigned long limit)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2344
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2345
2346
2347
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2348
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2349
  	int ret;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2350
2351
2352
2353
2354
2355
  
  	/*
  	 * To keep hierarchical reclaim simple, how long we should retry
  	 * depends on the caller. We set our retry count to be a function
  	 * of the number of children we should visit in this loop.
  	 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2356
2357
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2358

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2359
  	oldusage = page_counter_read(&memcg->memory);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2360

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2361
  	do {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2362
2363
2364
2365
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2366
2367
2368
2369
  
  		mutex_lock(&memcg_limit_mutex);
  		if (limit > memcg->memsw.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2370
  			ret = -EINVAL;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2371
2372
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2373
2374
2375
2376
  		if (limit > memcg->memory.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memory, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2377
2378
2379
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2380
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2381
  		curusage = page_counter_read(&memcg->memory);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2382
  		/* Usage is reduced ? */
f894ffa86   Andrew Morton   memcg: trivial cl...
2383
  		if (curusage >= oldusage)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2384
2385
2386
  			retry_count--;
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2387
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2388
2389
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
14797e236   KOSAKI Motohiro   memcg: add inacti...
2390

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2391
2392
  	return ret;
  }
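  /*
   * This path is reached from mem_cgroup_write() when the legacy (v1)
   * memory.limit_in_bytes file is written.  Lowering the limit below the
   * current usage does not fail immediately: the loop above keeps
   * calling try_to_free_mem_cgroup_pages() and only gives up, returning
   * the last page_counter_limit() error, once the retry budget is spent
   * without reducing usage.
   */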
338c84310   Li Zefan   memcg: remove som...
2393
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2394
  					 unsigned long limit)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2395
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2396
2397
2398
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2399
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2400
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2401

81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2402
  	/* see mem_cgroup_resize_limit */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2403
2404
2405
2406
2407
2408
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
  
  	oldusage = page_counter_read(&memcg->memsw);
  
  	do {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2409
2410
2411
2412
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2413
2414
2415
2416
  
  		mutex_lock(&memcg_limit_mutex);
  		if (limit < memcg->memory.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2417
  			ret = -EINVAL;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2418
2419
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2420
2421
2422
2423
  		if (limit > memcg->memsw.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memsw, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2424
2425
2426
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2427
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2428
  		curusage = page_counter_read(&memcg->memsw);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2429
  		/* Usage is reduced ? */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2430
  		if (curusage >= oldusage)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2431
  			retry_count--;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2432
2433
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2434
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2435
2436
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2437

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2438
2439
  	return ret;
  }
0608f43da   Andrew Morton   revert "memcg, vm...
2440
2441
2442
2443
2444
2445
2446
2447
2448
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  					    gfp_t gfp_mask,
  					    unsigned long *total_scanned)
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2449
  	unsigned long excess;
0608f43da   Andrew Morton   revert "memcg, vm...
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
  	unsigned long nr_scanned;
  
  	if (order > 0)
  		return 0;
  
  	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
  	/*
  	 * This loop can run for a while, especially if mem_cgroups continuously
  	 * keep exceeding their soft limit and putting the system under
  	 * pressure
  	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
  
  		nr_scanned = 0;
  		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
  						    gfp_mask, &nr_scanned);
  		nr_reclaimed += reclaimed;
  		*total_scanned += nr_scanned;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2474
  		spin_lock_irq(&mctz->lock);
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2475
  		__mem_cgroup_remove_exceeded(mz, mctz);
0608f43da   Andrew Morton   revert "memcg, vm...
2476
2477
2478
2479
2480
2481
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2482
2483
  		if (!reclaimed)
  			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2484
  		excess = soft_limit_excess(mz->memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
2485
2486
2487
2488
2489
2490
2491
2492
2493
  		/*
  		 * One school of thought says that we should not add
  		 * back the node to the tree if reclaim returns 0.
  		 * But our reclaim could return 0 simply because, due
  		 * to priority, we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this as a longer
  		 * term TODO.
  		 */
  		/* If excess == 0, no tree ops */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
2494
  		__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2495
  		spin_unlock_irq(&mctz->lock);
0608f43da   Andrew Morton   revert "memcg, vm...
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
  		css_put(&mz->memcg->css);
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
  		css_put(&next_mz->memcg->css);
  	return nr_reclaimed;
  }
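  /*
   * mem_cgroup_soft_limit_reclaim() is driven from the reclaim code in
   * mm/vmscan.c (kswapd and direct reclaim), which passes in the zone
   * under pressure and folds *total_scanned into its own scan counters.
   * The walk is over the per-zone soft-limit rbtree, always starting
   * from the cgroup that currently exceeds its soft limit the most.
   */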
ea280e7b4   Tejun Heo   memcg: update mem...
2512
2513
2514
2515
2516
2517
  /*
   * Test whether @memcg has children, dead or alive.  Note that this
   * function doesn't care whether @memcg has use_hierarchy enabled and
   * returns %true if there are child csses according to the cgroup
   * hierarchy.  Testing use_hierarchy is the caller's responsibility.
   */
b5f99b537   Glauber Costa   memcg: fast hiera...
2518
2519
  static inline bool memcg_has_children(struct mem_cgroup *memcg)
  {
ea280e7b4   Tejun Heo   memcg: update mem...
2520
  	bool ret;
ea280e7b4   Tejun Heo   memcg: update mem...
2521
2522
2523
2524
  	rcu_read_lock();
  	ret = css_next_child(NULL, &memcg->css);
  	rcu_read_unlock();
  	return ret;
b5f99b537   Glauber Costa   memcg: fast hiera...
2525
2526
2527
  }
  
  /*
51038171b   Greg Thelen   memcg: fix stale ...
2528
   * Reclaims as many pages from the given memcg as possible.
c26251f9f   Michal Hocko   memcg: split mem_...
2529
2530
2531
2532
2533
2534
   *
   * Caller is responsible for holding css reference for memcg.
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
  {
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c26251f9f   Michal Hocko   memcg: split mem_...
2535

c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2536
2537
  	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2538
  	/* try to free all pages in this cgroup */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2539
  	while (nr_retries && page_counter_read(&memcg->memory)) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2540
  		int progress;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2541

c26251f9f   Michal Hocko   memcg: split mem_...
2542
2543
  		if (signal_pending(current))
  			return -EINTR;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2544
2545
  		progress = try_to_free_mem_cgroup_pages(memcg, 1,
  							GFP_KERNEL, true);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2546
  		if (!progress) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2547
  			nr_retries--;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2548
  			/* maybe some writeback is necessary */
8aa7e847d   Jens Axboe   Fix congestion_wa...
2549
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2550
  		}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2551
2552
  
  	}
ab5196c20   Michal Hocko   memcg: make mem_c...
2553
2554
  
  	return 0;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2555
  }
6770c64e5   Tejun Heo   cgroup: replace c...
2556
2557
2558
  static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
  					    char *buf, size_t nbytes,
  					    loff_t off)
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2559
  {
6770c64e5   Tejun Heo   cgroup: replace c...
2560
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
c26251f9f   Michal Hocko   memcg: split mem_...
2561

d84230118   Michal Hocko   memcg: root_cgrou...
2562
2563
  	if (mem_cgroup_is_root(memcg))
  		return -EINVAL;
6770c64e5   Tejun Heo   cgroup: replace c...
2564
  	return mem_cgroup_force_empty(memcg) ?: nbytes;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2565
  }
182446d08   Tejun Heo   cgroup: pass arou...
2566
2567
  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
  				     struct cftype *cft)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2568
  {
182446d08   Tejun Heo   cgroup: pass arou...
2569
  	return mem_cgroup_from_css(css)->use_hierarchy;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2570
  }
182446d08   Tejun Heo   cgroup: pass arou...
2571
2572
  static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
  				      struct cftype *cft, u64 val)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2573
2574
  {
  	int retval = 0;
182446d08   Tejun Heo   cgroup: pass arou...
2575
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
2576
  	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
2577

567fb435b   Glauber Costa   memcg: fix bad be...
2578
  	if (memcg->use_hierarchy == val)
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
2579
  		return 0;
567fb435b   Glauber Costa   memcg: fix bad be...
2580

18f59ea7d   Balbir Singh   memcg: memory cgr...
2581
  	/*
af901ca18   André Goddard Rosa   tree-wide: fix as...
2582
  	 * If parent's use_hierarchy is set, we can't make any modifications
18f59ea7d   Balbir Singh   memcg: memory cgr...
2583
2584
2585
2586
2587
2588
  	 * in the child subtrees. If it is unset, then the change can
  	 * occur, provided the current cgroup has no children.
  	 *
  	 * For the root cgroup, parent_mem is NULL, we allow value to be
  	 * set if there are no children.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2589
  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
18f59ea7d   Balbir Singh   memcg: memory cgr...
2590
  				(val == 1 || val == 0)) {
ea280e7b4   Tejun Heo   memcg: update mem...
2591
  		if (!memcg_has_children(memcg))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2592
  			memcg->use_hierarchy = val;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2593
2594
2595
2596
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
567fb435b   Glauber Costa   memcg: fix bad be...
2597

18f59ea7d   Balbir Singh   memcg: memory cgr...
2598
2599
  	return retval;
  }
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2600
  static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2601
2602
  {
  	struct mem_cgroup *iter;
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2603
  	int i;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2604

72b54e731   Vladimir Davydov   mm: memcontrol: m...
2605
  	memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2606

72b54e731   Vladimir Davydov   mm: memcontrol: m...
2607
2608
2609
2610
  	for_each_mem_cgroup_tree(iter, memcg) {
  		for (i = 0; i < MEMCG_NR_STAT; i++)
  			stat[i] += mem_cgroup_read_stat(iter, i);
  	}
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2611
  }
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2612
  static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
587d9f726   Johannes Weiner   mm: memcontrol: b...
2613
2614
  {
  	struct mem_cgroup *iter;
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2615
  	int i;
587d9f726   Johannes Weiner   mm: memcontrol: b...
2616

72b54e731   Vladimir Davydov   mm: memcontrol: m...
2617
  	memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
587d9f726   Johannes Weiner   mm: memcontrol: b...
2618

72b54e731   Vladimir Davydov   mm: memcontrol: m...
2619
2620
2621
2622
  	for_each_mem_cgroup_tree(iter, memcg) {
  		for (i = 0; i < MEMCG_NR_EVENTS; i++)
  			events[i] += mem_cgroup_read_events(iter, i);
  	}
587d9f726   Johannes Weiner   mm: memcontrol: b...
2623
  }
6f6461562   Andrew Morton   mm/memcontrol.c: ...
2624
  static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2625
  {
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2626
  	unsigned long val = 0;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2627

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2628
  	if (mem_cgroup_is_root(memcg)) {
72b54e731   Vladimir Davydov   mm: memcontrol: m...
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
  		struct mem_cgroup *iter;
  
  		for_each_mem_cgroup_tree(iter, memcg) {
  			val += mem_cgroup_read_stat(iter,
  					MEM_CGROUP_STAT_CACHE);
  			val += mem_cgroup_read_stat(iter,
  					MEM_CGROUP_STAT_RSS);
  			if (swap)
  				val += mem_cgroup_read_stat(iter,
  						MEM_CGROUP_STAT_SWAP);
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2640
  	} else {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2641
  		if (!swap)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2642
  			val = page_counter_read(&memcg->memory);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2643
  		else
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2644
  			val = page_counter_read(&memcg->memsw);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2645
  	}
c12176d33   Michal Hocko   memcg: fix thresh...
2646
  	return val;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2647
  }
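  /*
   * The root group is special-cased above because charges are never
   * applied to root's page counters (the charge and uncharge paths bail
   * out early for mem_cgroup_is_root()), so root's usage has to be
   * reconstructed from the statistics of the whole hierarchy.
   */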
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2648
2649
2650
2651
2652
2653
2654
  enum {
  	RES_USAGE,
  	RES_LIMIT,
  	RES_MAX_USAGE,
  	RES_FAILCNT,
  	RES_SOFT_LIMIT,
  };
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2655

791badbdb   Tejun Heo   memcg: convert aw...
2656
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
05b843012   Johannes Weiner   mm: memcontrol: u...
2657
  			       struct cftype *cft)
8cdea7c05   Balbir Singh   Memory controller...
2658
  {
182446d08   Tejun Heo   cgroup: pass arou...
2659
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2660
  	struct page_counter *counter;
af36f906c   Tejun Heo   memcg: always cre...
2661

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2662
  	switch (MEMFILE_TYPE(cft->private)) {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2663
  	case _MEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2664
2665
  		counter = &memcg->memory;
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2666
  	case _MEMSWAP:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2667
2668
  		counter = &memcg->memsw;
  		break;
510fc4e11   Glauber Costa   memcg: kmem accou...
2669
  	case _KMEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2670
  		counter = &memcg->kmem;
510fc4e11   Glauber Costa   memcg: kmem accou...
2671
  		break;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2672
  	case _TCP:
0db152981   Johannes Weiner   mm: memcontrol: f...
2673
  		counter = &memcg->tcpmem;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2674
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2675
2676
  	default:
  		BUG();
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2677
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2678
2679
2680
2681
  
  	switch (MEMFILE_ATTR(cft->private)) {
  	case RES_USAGE:
  		if (counter == &memcg->memory)
c12176d33   Michal Hocko   memcg: fix thresh...
2682
  			return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2683
  		if (counter == &memcg->memsw)
c12176d33   Michal Hocko   memcg: fix thresh...
2684
  			return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
  		return (u64)page_counter_read(counter) * PAGE_SIZE;
  	case RES_LIMIT:
  		return (u64)counter->limit * PAGE_SIZE;
  	case RES_MAX_USAGE:
  		return (u64)counter->watermark * PAGE_SIZE;
  	case RES_FAILCNT:
  		return counter->failcnt;
  	case RES_SOFT_LIMIT:
  		return (u64)memcg->soft_limit * PAGE_SIZE;
  	default:
  		BUG();
  	}
8cdea7c05   Balbir Singh   Memory controller...
2697
  }
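  /*
   * For reference, the legacy (v1) files that end up in this handler:
   *
   *   _MEM     -> memory.usage_in_bytes, memory.limit_in_bytes,
   *               memory.max_usage_in_bytes, memory.failcnt,
   *               memory.soft_limit_in_bytes
   *   _MEMSWAP -> the corresponding memory.memsw.* files
   *   _KMEM    -> memory.kmem.*
   *   _TCP     -> memory.kmem.tcp.*
   *
   * The RES_* attribute selects which page_counter field is reported:
   * usage, limit, watermark or failcnt.
   */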
510fc4e11   Glauber Costa   memcg: kmem accou...
2698

127424c86   Johannes Weiner   mm: memcontrol: m...
2699
  #ifndef CONFIG_SLOB
567e9ab2e   Johannes Weiner   mm: memcontrol: g...
2700
  static int memcg_online_kmem(struct mem_cgroup *memcg)
d64416377   Vladimir Davydov   memcg: rework mem...
2701
  {
d64416377   Vladimir Davydov   memcg: rework mem...
2702
  	int memcg_id;
b313aeee2   Vladimir Davydov   mm: memcontrol: e...
2703
2704
  	if (cgroup_memory_nokmem)
  		return 0;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2705
  	BUG_ON(memcg->kmemcg_id >= 0);
567e9ab2e   Johannes Weiner   mm: memcontrol: g...
2706
  	BUG_ON(memcg->kmem_state);
d64416377   Vladimir Davydov   memcg: rework mem...
2707

f3bb3043a   Vladimir Davydov   memcg: don't call...
2708
  	memcg_id = memcg_alloc_cache_id();
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
2709
2710
  	if (memcg_id < 0)
  		return memcg_id;
d64416377   Vladimir Davydov   memcg: rework mem...
2711

ef12947c9   Johannes Weiner   mm: memcontrol: s...
2712
  	static_branch_inc(&memcg_kmem_enabled_key);
d64416377   Vladimir Davydov   memcg: rework mem...
2713
  	/*
567e9ab2e   Johannes Weiner   mm: memcontrol: g...
2714
  	 * A memory cgroup is considered kmem-online as soon as it gets
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
2715
  	 * kmemcg_id. Setting the id after enabling static branching will
d64416377   Vladimir Davydov   memcg: rework mem...
2716
2717
2718
  	 * guarantee no one starts accounting before all call sites are
  	 * patched.
  	 */
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
2719
  	memcg->kmemcg_id = memcg_id;
567e9ab2e   Johannes Weiner   mm: memcontrol: g...
2720
  	memcg->kmem_state = KMEM_ONLINE;
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
2721
2722
  
  	return 0;
d64416377   Vladimir Davydov   memcg: rework mem...
2723
  }
8e0a89121   Johannes Weiner   mm: memcontrol: g...
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
  static void memcg_offline_kmem(struct mem_cgroup *memcg)
  {
  	struct cgroup_subsys_state *css;
  	struct mem_cgroup *parent, *child;
  	int kmemcg_id;
  
  	if (memcg->kmem_state != KMEM_ONLINE)
  		return;
  	/*
  	 * Clear the online state before clearing memcg_caches array
  	 * entries. The slab_mutex in memcg_deactivate_kmem_caches()
  	 * guarantees that no cache will be created for this cgroup
  	 * after we are done (see memcg_create_kmem_cache()).
  	 */
  	memcg->kmem_state = KMEM_ALLOCATED;
  
  	memcg_deactivate_kmem_caches(memcg);
  
  	kmemcg_id = memcg->kmemcg_id;
  	BUG_ON(kmemcg_id < 0);
  
  	parent = parent_mem_cgroup(memcg);
  	if (!parent)
  		parent = root_mem_cgroup;
  
  	/*
  	 * Change kmemcg_id of this cgroup and all its descendants to the
  	 * parent's id, and then move all entries from this cgroup's list_lrus
  	 * to ones of the parent. After we have finished, all list_lrus
  	 * corresponding to this cgroup are guaranteed to remain empty. The
  	 * ordering is imposed by list_lru_node->lock taken by
  	 * memcg_drain_all_list_lrus().
  	 */
  	css_for_each_descendant_pre(css, &memcg->css) {
  		child = mem_cgroup_from_css(css);
  		BUG_ON(child->kmemcg_id != kmemcg_id);
  		child->kmemcg_id = parent->kmemcg_id;
  		if (!memcg->use_hierarchy)
  			break;
  	}
  	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
  
  	memcg_free_cache_id(kmemcg_id);
  }
  
  static void memcg_free_kmem(struct mem_cgroup *memcg)
  {
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
2771
2772
2773
  	/* css_alloc() failed, offlining didn't happen */
  	if (unlikely(memcg->kmem_state == KMEM_ONLINE))
  		memcg_offline_kmem(memcg);
8e0a89121   Johannes Weiner   mm: memcontrol: g...
2774
2775
2776
2777
2778
  	if (memcg->kmem_state == KMEM_ALLOCATED) {
  		memcg_destroy_kmem_caches(memcg);
  		static_branch_dec(&memcg_kmem_enabled_key);
  		WARN_ON(page_counter_read(&memcg->kmem));
  	}
8e0a89121   Johannes Weiner   mm: memcontrol: g...
2779
  }
d64416377   Vladimir Davydov   memcg: rework mem...
2780
  #else
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
2781
  static int memcg_online_kmem(struct mem_cgroup *memcg)
127424c86   Johannes Weiner   mm: memcontrol: m...
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
  {
  	return 0;
  }
  static void memcg_offline_kmem(struct mem_cgroup *memcg)
  {
  }
  static void memcg_free_kmem(struct mem_cgroup *memcg)
  {
  }
  #endif /* !CONFIG_SLOB */
d64416377   Vladimir Davydov   memcg: rework mem...
2792
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2793
  				   unsigned long limit)
d64416377   Vladimir Davydov   memcg: rework mem...
2794
  {
b313aeee2   Vladimir Davydov   mm: memcontrol: e...
2795
  	int ret;
127424c86   Johannes Weiner   mm: memcontrol: m...
2796
2797
  
  	mutex_lock(&memcg_limit_mutex);
127424c86   Johannes Weiner   mm: memcontrol: m...
2798
  	ret = page_counter_limit(&memcg->kmem, limit);
127424c86   Johannes Weiner   mm: memcontrol: m...
2799
2800
  	mutex_unlock(&memcg_limit_mutex);
  	return ret;
d64416377   Vladimir Davydov   memcg: rework mem...
2801
  }
510fc4e11   Glauber Costa   memcg: kmem accou...
2802

d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2803
2804
2805
2806
2807
  static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
  {
  	int ret;
  
  	mutex_lock(&memcg_limit_mutex);
0db152981   Johannes Weiner   mm: memcontrol: f...
2808
  	ret = page_counter_limit(&memcg->tcpmem, limit);
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2809
2810
  	if (ret)
  		goto out;
0db152981   Johannes Weiner   mm: memcontrol: f...
2811
  	if (!memcg->tcpmem_active) {
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
  		/*
  		 * The active flag needs to be written after the static_key
  		 * update. This is what guarantees that the socket activation
  		 * function is the last one to run. See sock_update_memcg() for
  		 * details, and note that we don't mark any socket as belonging
  		 * to this memcg until that flag is up.
  		 *
  		 * We need to do this, because static_keys will span multiple
  		 * sites, but we can't control their order. If we mark a socket
  		 * as accounted, but the accounting functions are not patched in
  		 * yet, we'll lose accounting.
  		 *
  		 * We never race with the readers in sock_update_memcg(),
  		 * because when this value changes, the code to process it is not
  		 * patched in yet.
  		 */
  		static_branch_inc(&memcg_sockets_enabled_key);
0db152981   Johannes Weiner   mm: memcontrol: f...
2829
  		memcg->tcpmem_active = true;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2830
2831
2832
2833
2834
  	}
  out:
  	mutex_unlock(&memcg_limit_mutex);
  	return ret;
  }
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2835

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2836
2837
2838
2839
  /*
   * The user of this function is...
   * RES_LIMIT.
   */
451af504d   Tejun Heo   cgroup: replace c...
2840
2841
  static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
8cdea7c05   Balbir Singh   Memory controller...
2842
  {
451af504d   Tejun Heo   cgroup: replace c...
2843
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2844
  	unsigned long nr_pages;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2845
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
2846
  	buf = strstrip(buf);
650c5e565   Johannes Weiner   mm: page_counter:...
2847
  	ret = page_counter_memparse(buf, "-1", &nr_pages);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2848
2849
  	if (ret)
  		return ret;
af36f906c   Tejun Heo   memcg: always cre...
2850

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2851
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2852
  	case RES_LIMIT:
4b3bde4c9   Balbir Singh   memcg: remove the...
2853
2854
2855
2856
  		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
  			ret = -EINVAL;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2857
2858
2859
  		switch (MEMFILE_TYPE(of_cft(of)->private)) {
  		case _MEM:
  			ret = mem_cgroup_resize_limit(memcg, nr_pages);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2860
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2861
2862
  		case _MEMSWAP:
  			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
296c81d89   Balbir Singh   memory controller...
2863
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2864
2865
2866
  		case _KMEM:
  			ret = memcg_update_kmem_limit(memcg, nr_pages);
  			break;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2867
2868
2869
  		case _TCP:
  			ret = memcg_update_tcp_limit(memcg, nr_pages);
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2870
  		}
296c81d89   Balbir Singh   memory controller...
2871
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2872
2873
2874
  	case RES_SOFT_LIMIT:
  		memcg->soft_limit = nr_pages;
  		ret = 0;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2875
2876
  		break;
  	}
451af504d   Tejun Heo   cgroup: replace c...
2877
  	return ret ?: nbytes;
8cdea7c05   Balbir Singh   Memory controller...
2878
  }
6770c64e5   Tejun Heo   cgroup: replace c...
2879
2880
  static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
  				size_t nbytes, loff_t off)
c84872e16   Pavel Emelyanov   memcgroup: add th...
2881
  {
6770c64e5   Tejun Heo   cgroup: replace c...
2882
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2883
  	struct page_counter *counter;
c84872e16   Pavel Emelyanov   memcgroup: add th...
2884

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
  	switch (MEMFILE_TYPE(of_cft(of)->private)) {
  	case _MEM:
  		counter = &memcg->memory;
  		break;
  	case _MEMSWAP:
  		counter = &memcg->memsw;
  		break;
  	case _KMEM:
  		counter = &memcg->kmem;
  		break;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2895
  	case _TCP:
0db152981   Johannes Weiner   mm: memcontrol: f...
2896
  		counter = &memcg->tcpmem;
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
2897
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2898
2899
2900
  	default:
  		BUG();
  	}
af36f906c   Tejun Heo   memcg: always cre...
2901

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2902
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
2903
  	case RES_MAX_USAGE:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2904
  		page_counter_reset_watermark(counter);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
2905
2906
  		break;
  	case RES_FAILCNT:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2907
  		counter->failcnt = 0;
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
2908
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2909
2910
  	default:
  		BUG();
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
2911
  	}
f64c3f549   Balbir Singh   memory controller...
2912

6770c64e5   Tejun Heo   cgroup: replace c...
2913
  	return nbytes;
c84872e16   Pavel Emelyanov   memcgroup: add th...
2914
  }
182446d08   Tejun Heo   cgroup: pass arou...
2915
  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
2916
2917
  					struct cftype *cft)
  {
182446d08   Tejun Heo   cgroup: pass arou...
2918
  	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
2919
  }
024914477   Daisuke Nishimura   memcg: move charg...
2920
  #ifdef CONFIG_MMU
182446d08   Tejun Heo   cgroup: pass arou...
2921
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
2922
2923
  					struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
2924
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7dc74be03   Daisuke Nishimura   memcg: add interf...
2925

1dfab5abc   Johannes Weiner   mm: memcontrol: f...
2926
  	if (val & ~MOVE_MASK)
7dc74be03   Daisuke Nishimura   memcg: add interf...
2927
  		return -EINVAL;
ee5e8472b   Glauber Costa   memcg: prevent ch...
2928

7dc74be03   Daisuke Nishimura   memcg: add interf...
2929
  	/*
ee5e8472b   Glauber Costa   memcg: prevent ch...
2930
2931
2932
2933
  	 * No kind of locking is needed in here, because ->can_attach() will
  	 * check this value once at the beginning of the process, and then carry
  	 * on with stale data. This means that changes to this value will only
  	 * affect task migrations starting after the change.
7dc74be03   Daisuke Nishimura   memcg: add interf...
2934
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2935
  	memcg->move_charge_at_immigrate = val;
7dc74be03   Daisuke Nishimura   memcg: add interf...
2936
2937
  	return 0;
  }
024914477   Daisuke Nishimura   memcg: move charg...
2938
  #else
182446d08   Tejun Heo   cgroup: pass arou...
2939
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
024914477   Daisuke Nishimura   memcg: move charg...
2940
2941
2942
2943
2944
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
2945

406eb0c9b   Ying Han   memcg: add memory...
2946
  #ifdef CONFIG_NUMA
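  /*
   * memory.numa_stat: per-node LRU page counts for this memcg, reported
   * both for the local group and (prefixed "hierarchical_") summed over
   * the whole subtree via for_each_mem_cgroup_tree().
   */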
2da8ca822   Tejun Heo   cgroup: replace c...
2947
  static int memcg_numa_stat_show(struct seq_file *m, void *v)
406eb0c9b   Ying Han   memcg: add memory...
2948
  {
25485de6e   Greg Thelen   memcg: refactor m...
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
  	struct numa_stat {
  		const char *name;
  		unsigned int lru_mask;
  	};
  
  	static const struct numa_stat stats[] = {
  		{ "total", LRU_ALL },
  		{ "file", LRU_ALL_FILE },
  		{ "anon", LRU_ALL_ANON },
  		{ "unevictable", BIT(LRU_UNEVICTABLE) },
  	};
  	const struct numa_stat *stat;
406eb0c9b   Ying Han   memcg: add memory...
2961
  	int nid;
25485de6e   Greg Thelen   memcg: refactor m...
2962
  	unsigned long nr;
2da8ca822   Tejun Heo   cgroup: replace c...
2963
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
406eb0c9b   Ying Han   memcg: add memory...
2964

25485de6e   Greg Thelen   memcg: refactor m...
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
  		seq_printf(m, "%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
  							  stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
2975
  	}
406eb0c9b   Ying Han   memcg: add memory...
2976

071aee138   Ying Han   memcg: support hi...
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		struct mem_cgroup *iter;
  
  		nr = 0;
  		for_each_mem_cgroup_tree(iter, memcg)
  			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
  		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = 0;
  			for_each_mem_cgroup_tree(iter, memcg)
  				nr += mem_cgroup_node_nr_lru_pages(
  					iter, nid, stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
2993
  	}
406eb0c9b   Ying Han   memcg: add memory...
2994

406eb0c9b   Ying Han   memcg: add memory...
2995
2996
2997
  	return 0;
  }
  #endif /* CONFIG_NUMA */
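  /*
   * memory.stat: local counters, events and LRU sizes first, then the
   * effective hierarchical limits and the "total_*" sums aggregated over
   * the subtree, plus reclaim statistics under CONFIG_DEBUG_VM.
   */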
2da8ca822   Tejun Heo   cgroup: replace c...
2998
  static int memcg_stat_show(struct seq_file *m, void *v)
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
2999
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3000
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3001
  	unsigned long memory, memsw;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3002
3003
  	struct mem_cgroup *mi;
  	unsigned int i;
406eb0c9b   Ying Han   memcg: add memory...
3004

0ca44b148   Greg Thelen   memcg: add BUILD_...
3005
3006
3007
3008
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
  		     MEM_CGROUP_STAT_NSTATS);
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
  		     MEM_CGROUP_EVENTS_NSTATS);
70bc068c4   Rickard Strandqvist   mm/memcontrol.c: ...
3009
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3010
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
7941d2145   Johannes Weiner   mm: memcontrol: d...
3011
  		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3012
  			continue;
484ebb3b8   Greg Thelen   memcg: make mem_c...
3013
3014
  		seq_printf(m, "%s %lu
  ", mem_cgroup_stat_names[i],
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3015
  			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3016
  	}
7b854121e   Lee Schermerhorn   Unevictable LRU P...
3017

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3018
3019
3020
3021
3022
3023
3024
3025
3026
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_events_names[i],
  			   mem_cgroup_read_events(memcg, i));
  
  	for (i = 0; i < NR_LRU_LISTS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_lru_names[i],
  			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3027
  	/* Hierarchical information */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3028
3029
3030
3031
  	memory = memsw = PAGE_COUNTER_MAX;
  	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
  		memory = min(memory, mi->memory.limit);
  		memsw = min(memsw, mi->memsw.limit);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
3032
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3033
3034
3035
  	seq_printf(m, "hierarchical_memory_limit %llu
  ",
  		   (u64)memory * PAGE_SIZE);
7941d2145   Johannes Weiner   mm: memcontrol: d...
3036
  	if (do_memsw_account())
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3037
3038
3039
  		seq_printf(m, "hierarchical_memsw_limit %llu
  ",
  			   (u64)memsw * PAGE_SIZE);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3040

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3041
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
484ebb3b8   Greg Thelen   memcg: make mem_c...
3042
  		unsigned long long val = 0;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3043

7941d2145   Johannes Weiner   mm: memcontrol: d...
3044
  		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3045
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3046
3047
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
484ebb3b8   Greg Thelen   memcg: make mem_c...
3048
3049
  		seq_printf(m, "total_%s %llu
  ", mem_cgroup_stat_names[i], val);
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
  	}
  
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_events(mi, i);
  		seq_printf(m, "total_%s %llu
  ",
  			   mem_cgroup_events_names[i], val);
  	}
  
  	for (i = 0; i < NR_LRU_LISTS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
  		seq_printf(m, "total_%s %llu
  ", mem_cgroup_lru_names[i], val);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3069
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3070

7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3071
  #ifdef CONFIG_DEBUG_VM
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3072
3073
3074
  	{
  		int nid, zid;
  		struct mem_cgroup_per_zone *mz;
89abfab13   Hugh Dickins   mm/memcg: move re...
3075
  		struct zone_reclaim_stat *rstat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3076
3077
3078
3079
3080
  		unsigned long recent_rotated[2] = {0, 0};
  		unsigned long recent_scanned[2] = {0, 0};
  
  		for_each_online_node(nid)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
3081
  				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
89abfab13   Hugh Dickins   mm/memcg: move re...
3082
  				rstat = &mz->lruvec.reclaim_stat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3083

89abfab13   Hugh Dickins   mm/memcg: move re...
3084
3085
3086
3087
  				recent_rotated[0] += rstat->recent_rotated[0];
  				recent_rotated[1] += rstat->recent_rotated[1];
  				recent_scanned[0] += rstat->recent_scanned[0];
  				recent_scanned[1] += rstat->recent_scanned[1];
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3088
  			}
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
3089
3090
3091
3092
3093
3094
3095
3096
  		seq_printf(m, "recent_rotated_anon %lu
  ", recent_rotated[0]);
  		seq_printf(m, "recent_rotated_file %lu
  ", recent_rotated[1]);
  		seq_printf(m, "recent_scanned_anon %lu
  ", recent_scanned[0]);
  		seq_printf(m, "recent_scanned_file %lu
  ", recent_scanned[1]);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3097
3098
  	}
  #endif
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3099
3100
  	return 0;
  }
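  /*
   * memory.swappiness: per-memcg swappiness in the range [0, 100].
   * Writing to the root cgroup's file updates the global vm_swappiness
   * instead.
   */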
182446d08   Tejun Heo   cgroup: pass arou...
3101
3102
  static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
  				      struct cftype *cft)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3103
  {
182446d08   Tejun Heo   cgroup: pass arou...
3104
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3105

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
3106
  	return mem_cgroup_swappiness(memcg);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3107
  }
182446d08   Tejun Heo   cgroup: pass arou...
3108
3109
  static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
  				       struct cftype *cft, u64 val)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3110
  {
182446d08   Tejun Heo   cgroup: pass arou...
3111
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3112

3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3113
  	if (val > 100)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3114
  		return -EINVAL;
14208b0ec   Linus Torvalds   Merge branch 'for...
3115
  	if (css->parent)
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3116
3117
3118
  		memcg->swappiness = val;
  	else
  		vm_swappiness = val;
068b38c1f   Li Zefan   memcg: fix a race...
3119

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3120
3121
  	return 0;
  }
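  /*
   * Walk the sorted threshold array around ->current_threshold and
   * signal the eventfd of every threshold that was crossed (in either
   * direction) since the last invocation, then record the new position.
   */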
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3122
3123
3124
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
  	struct mem_cgroup_threshold_ary *t;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3125
  	unsigned long usage;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3126
3127
3128
3129
  	int i;
  
  	rcu_read_lock();
  	if (!swap)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3130
  		t = rcu_dereference(memcg->thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3131
  	else
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3132
  		t = rcu_dereference(memcg->memsw_thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3133
3134
3135
  
  	if (!t)
  		goto unlock;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3136
  	usage = mem_cgroup_usage(memcg, swap);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3137
3138
  
  	/*
748dad36d   Sha Zhengju   memcg: make thres...
3139
  	 * current_threshold points to threshold just below or equal to usage.
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3140
3141
3142
  	 * If that is not the case, a threshold was crossed after the last
  	 * call of __mem_cgroup_threshold().
  	 */
5407a5625   Phil Carmody   mm: remove unnece...
3143
  	i = t->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
  
  	/*
  	 * Iterate backward over array of thresholds starting from
  	 * current_threshold and check if a threshold is crossed.
  	 * If none of thresholds below usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* i = current_threshold + 1 */
  	i++;
  
  	/*
  	 * Iterate forward over array of thresholds starting from
  	 * current_threshold+1 and check if a threshold is crossed.
  	 * If none of thresholds above usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* Update current_threshold */
5407a5625   Phil Carmody   mm: remove unnece...
3167
  	t->current_threshold = i - 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3168
3169
3170
3171
3172
3173
  unlock:
  	rcu_read_unlock();
  }
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
ad4ca5f4b   Kirill A. Shutemov   memcg: fix thresh...
3174
3175
  	while (memcg) {
  		__mem_cgroup_threshold(memcg, false);
7941d2145   Johannes Weiner   mm: memcontrol: d...
3176
  		if (do_memsw_account())
ad4ca5f4b   Kirill A. Shutemov   memcg: fix thresh...
3177
3178
3179
3180
  			__mem_cgroup_threshold(memcg, true);
  
  		memcg = parent_mem_cgroup(memcg);
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3181
3182
3183
3184
3185
3186
  }
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
2bff24a37   Greg Thelen   memcg: fix multip...
3187
3188
3189
3190
3191
3192
3193
  	if (_a->threshold > _b->threshold)
  		return 1;
  
  	if (_a->threshold < _b->threshold)
  		return -1;
  
  	return 0;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3194
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3195
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3196
3197
  {
  	struct mem_cgroup_eventfd_list *ev;
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3198
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
3199
  	list_for_each_entry(ev, &memcg->oom_notify, list)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3200
  		eventfd_signal(ev->eventfd, 1);
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3201
3202
  
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3203
3204
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3205
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3206
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3207
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3208
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3209
  		mem_cgroup_oom_notify_cb(iter);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3210
  }
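  /*
   * Threshold registration: parse the threshold, allocate a new array
   * one entry larger than the current primary one, copy and sort the
   * entries, and publish the result with rcu_assign_pointer() while the
   * old primary array is kept as ->spare.
   */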
59b6f8734   Tejun Heo   memcg: make cgrou...
3211
  static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3212
  	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3213
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3214
3215
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3216
3217
  	unsigned long threshold;
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3218
  	int i, size, ret;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3219

650c5e565   Johannes Weiner   mm: page_counter:...
3220
  	ret = page_counter_memparse(args, "-1", &threshold);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3221
3222
3223
3224
  	if (ret)
  		return ret;
  
  	mutex_lock(&memcg->thresholds_lock);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3225

05b843012   Johannes Weiner   mm: memcontrol: u...
3226
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3227
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3228
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3229
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3230
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3231
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3232
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3233
  		BUG();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3234
  	/* Check if a threshold was crossed before adding a new one */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3235
  	if (thresholds->primary)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3236
  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3237
  	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3238
3239
  
  	/* Allocate memory for new array of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3240
  	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3241
  			GFP_KERNEL);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3242
  	if (!new) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3243
3244
3245
  		ret = -ENOMEM;
  		goto unlock;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3246
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3247
3248
  
  	/* Copy thresholds (if any) to new array */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3249
3250
  	if (thresholds->primary) {
  		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3251
  				sizeof(struct mem_cgroup_threshold));
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3252
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3253
  	/* Add new threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3254
3255
  	new->entries[size - 1].eventfd = eventfd;
  	new->entries[size - 1].threshold = threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3256
3257
  
  	/* Sort thresholds. Registering a new threshold isn't time-critical */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3258
  	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3259
3260
3261
  			compare_thresholds, NULL);
  
  	/* Find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3262
  	new->current_threshold = -1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3263
  	for (i = 0; i < size; i++) {
748dad36d   Sha Zhengju   memcg: make thres...
3264
  		if (new->entries[i].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3265
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3266
3267
  			 * new->current_threshold will not be used until
  			 * rcu_assign_pointer(), so it's safe to increment
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3268
3269
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3270
  			++new->current_threshold;
748dad36d   Sha Zhengju   memcg: make thres...
3271
3272
  		} else
  			break;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3273
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3274
3275
3276
3277
3278
  	/* Free old spare buffer and save old primary buffer as spare */
  	kfree(thresholds->spare);
  	thresholds->spare = thresholds->primary;
  
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3279

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3280
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3281
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3282
3283
3284
3285
3286
  unlock:
  	mutex_unlock(&memcg->thresholds_lock);
  
  	return ret;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3287
  static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3288
3289
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3290
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3291
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3292
  static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3293
3294
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3295
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3296
  }
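  /*
   * Threshold removal: reuse the ->spare array, copy over every entry
   * whose eventfd does not match, and publish it as the new primary
   * array under RCU; if no entries are left, both arrays are freed.
   */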
59b6f8734   Tejun Heo   memcg: make cgrou...
3297
  static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3298
  	struct eventfd_ctx *eventfd, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3299
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3300
3301
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3302
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3303
  	int i, j, size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3304
3305
  
  	mutex_lock(&memcg->thresholds_lock);
05b843012   Johannes Weiner   mm: memcontrol: u...
3306
3307
  
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3308
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3309
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3310
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3311
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3312
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3313
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3314
  		BUG();
371528cae   Anton Vorontsov   mm: memcg: Correc...
3315
3316
  	if (!thresholds->primary)
  		goto unlock;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3317
3318
3319
3320
  	/* Check if a threshold was crossed before removing */
  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
  	/* Calculate the new number of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3321
3322
3323
  	size = 0;
  	for (i = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd != eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3324
3325
  			size++;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3326
  	new = thresholds->spare;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3327

2e72b6347   Kirill A. Shutemov   memcg: implement ...
3328
3329
  	/* Set thresholds array to NULL if we don't have thresholds */
  	if (!size) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3330
3331
  		kfree(new);
  		new = NULL;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3332
  		goto swap_buffers;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3333
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3334
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3335
3336
  
  	/* Copy thresholds and find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3337
3338
3339
  	new->current_threshold = -1;
  	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd == eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3340
  			continue;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3341
  		new->entries[j] = thresholds->primary->entries[i];
748dad36d   Sha Zhengju   memcg: make thres...
3342
  		if (new->entries[j].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3343
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3344
  			 * new->current_threshold will not be used
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3345
3346
3347
  			 * until rcu_assign_pointer(), so it's safe to increment
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3348
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3349
3350
3351
  		}
  		j++;
  	}
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3352
  swap_buffers:
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3353
3354
  	/* Swap primary and spare array */
  	thresholds->spare = thresholds->primary;
8c7577637   Sha Zhengju   memcg: free spare...
3355

2c488db27   Kirill A. Shutemov   memcg: clean up m...
3356
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3357

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3358
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3359
  	synchronize_rcu();
6611d8d76   Martijn Coenen   memcg: only free ...
3360
3361
3362
3363
3364
3365
  
  	/* If all events are unregistered, free the spare array */
  	if (!new) {
  		kfree(thresholds->spare);
  		thresholds->spare = NULL;
  	}
371528cae   Anton Vorontsov   mm: memcg: Correc...
3366
  unlock:
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3367
  	mutex_unlock(&memcg->thresholds_lock);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3368
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3369

59b6f8734   Tejun Heo   memcg: make cgrou...
3370
  static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3371
3372
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3373
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3374
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3375
  static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3376
3377
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3378
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3379
  }
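  /*
   * memory.oom_control notifications: registered eventfds are kept on
   * ->oom_notify and signalled from mem_cgroup_oom_notify(); a newly
   * registered eventfd is signalled immediately if the group is already
   * under OOM.
   */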
59b6f8734   Tejun Heo   memcg: make cgrou...
3380
  static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3381
  	struct eventfd_ctx *eventfd, const char *args)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3382
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3383
  	struct mem_cgroup_eventfd_list *event;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3384

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3385
3386
3387
  	event = kmalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
1af8efe96   Michal Hocko   memcg: change mem...
3388
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3389
3390
3391
3392
3393
  
  	event->eventfd = eventfd;
  	list_add(&event->list, &memcg->oom_notify);
  
  	/* already in OOM? */
c2b42d3ca   Tejun Heo   memcg: convert me...
3394
  	if (memcg->under_oom)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3395
  		eventfd_signal(eventfd, 1);
1af8efe96   Michal Hocko   memcg: change mem...
3396
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3397
3398
3399
  
  	return 0;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3400
  static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3401
  	struct eventfd_ctx *eventfd)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3402
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3403
  	struct mem_cgroup_eventfd_list *ev, *tmp;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3404

1af8efe96   Michal Hocko   memcg: change mem...
3405
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3406

c0ff4b854   Raghavendra K T   memcg: rename mem...
3407
  	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3408
3409
3410
3411
3412
  		if (ev->eventfd == eventfd) {
  			list_del(&ev->list);
  			kfree(ev);
  		}
  	}
1af8efe96   Michal Hocko   memcg: change mem...
3413
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3414
  }
2da8ca822   Tejun Heo   cgroup: replace c...
3415
  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3416
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3417
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3418

791badbdb   Tejun Heo   memcg: convert aw...
3419
3420
  	seq_printf(sf, "oom_kill_disable %d
  ", memcg->oom_kill_disable);
c2b42d3ca   Tejun Heo   memcg: convert me...
3421
3422
  	seq_printf(sf, "under_oom %d
  ", (bool)memcg->under_oom);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3423
3424
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
3425
  static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3426
3427
  	struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3428
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3429
3430
  
  	/* cannot set to root cgroup and only 0 and 1 are allowed */
14208b0ec   Linus Torvalds   Merge branch 'for...
3431
  	if (!css->parent || !((val == 0) || (val == 1)))
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3432
  		return -EINVAL;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3433
  	memcg->oom_kill_disable = val;
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
3434
  	if (!val)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3435
  		memcg_oom_recover(memcg);
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3436

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3437
3438
  	return 0;
  }
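  /*
   * cgroup writeback support: each memcg carries a wb_domain so that
   * balance_dirty_pages() can apply per-cgroup dirty limits; the stubs
   * in the #else branch keep callers working when CONFIG_CGROUP_WRITEBACK
   * is disabled.
   */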
52ebea749   Tejun Heo   writeback: make b...
3439
3440
3441
3442
3443
3444
  #ifdef CONFIG_CGROUP_WRITEBACK
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
  {
  	return &memcg->cgwb_list;
  }
841710aa6   Tejun Heo   writeback: implem...
3445
3446
3447
3448
3449
3450
3451
3452
3453
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
  {
  	return wb_domain_init(&memcg->cgwb_domain, gfp);
  }
  
  static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
  {
  	wb_domain_exit(&memcg->cgwb_domain);
  }
2529bb3aa   Tejun Heo   writeback: reset ...
3454
3455
3456
3457
  static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
  {
  	wb_domain_size_changed(&memcg->cgwb_domain);
  }
841710aa6   Tejun Heo   writeback: implem...
3458
3459
3460
3461
3462
3463
3464
3465
3466
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
  
  	if (!memcg->css.parent)
  		return NULL;
  
  	return &memcg->cgwb_domain;
  }
c2aa723a6   Tejun Heo   writeback: implem...
3467
3468
3469
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
c5edf9cdc   Tejun Heo   writeback: fix in...
3470
3471
   * @pfilepages: out parameter for number of file pages
   * @pheadroom: out parameter for number of allocatable pages according to memcg
c2aa723a6   Tejun Heo   writeback: implem...
3472
3473
3474
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
c5edf9cdc   Tejun Heo   writeback: fix in...
3475
3476
3477
   * Determine the numbers of file, headroom, dirty, and writeback pages in
   * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
   * is a bit more involved.
c2aa723a6   Tejun Heo   writeback: implem...
3478
   *
c5edf9cdc   Tejun Heo   writeback: fix in...
3479
3480
3481
3482
3483
   * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
   * headroom is calculated as the lowest headroom of itself and the
   * ancestors.  Note that this doesn't consider the actual amount of
   * available memory in the system.  The caller should further cap
   * *@pheadroom accordingly.
c2aa723a6   Tejun Heo   writeback: implem...
3484
   */
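  /*
   * Worked example (hypothetical numbers): a memcg limited to 800M with
   * 600M used has 200M of headroom; if an ancestor limited to 2G has
   * 1.95G used, the reported *@pheadroom is that ancestor's ~50M, since
   * the minimum across the hierarchy wins.
   */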
c5edf9cdc   Tejun Heo   writeback: fix in...
3485
3486
3487
  void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
  			 unsigned long *pheadroom, unsigned long *pdirty,
  			 unsigned long *pwriteback)
c2aa723a6   Tejun Heo   writeback: implem...
3488
3489
3490
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
  	struct mem_cgroup *parent;
c2aa723a6   Tejun Heo   writeback: implem...
3491
3492
3493
3494
3495
  
  	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
  	/* this should eventually include NR_UNSTABLE_NFS */
  	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
c5edf9cdc   Tejun Heo   writeback: fix in...
3496
3497
3498
  	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
  						     (1 << LRU_ACTIVE_FILE));
  	*pheadroom = PAGE_COUNTER_MAX;
c2aa723a6   Tejun Heo   writeback: implem...
3499

c2aa723a6   Tejun Heo   writeback: implem...
3500
3501
3502
  	while ((parent = parent_mem_cgroup(memcg))) {
  		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
  		unsigned long used = page_counter_read(&memcg->memory);
c5edf9cdc   Tejun Heo   writeback: fix in...
3503
  		*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
c2aa723a6   Tejun Heo   writeback: implem...
3504
3505
  		memcg = parent;
  	}
c2aa723a6   Tejun Heo   writeback: implem...
3506
  }
841710aa6   Tejun Heo   writeback: implem...
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
  #else	/* CONFIG_CGROUP_WRITEBACK */
  
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
  {
  	return 0;
  }
  
  static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
  {
  }
2529bb3aa   Tejun Heo   writeback: reset ...
3517
3518
3519
  static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
  {
  }
52ebea749   Tejun Heo   writeback: make b...
3520
  #endif	/* CONFIG_CGROUP_WRITEBACK */
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3521
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
   * DO NOT USE IN NEW FILES.
   *
   * "cgroup.event_control" implementation.
   *
   * This is way over-engineered.  It tries to support fully configurable
   * events for each user.  Such a level of flexibility is completely
   * unnecessary, especially in light of the planned unified hierarchy.
   *
   * Please deprecate this and replace with something simpler if at all
   * possible.
   */
  
  /*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3535
3536
3537
3538
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3539
  static void memcg_event_remove(struct work_struct *work)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3540
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3541
3542
  	struct mem_cgroup_event *event =
  		container_of(work, struct mem_cgroup_event, remove);
59b6f8734   Tejun Heo   memcg: make cgrou...
3543
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3544
3545
  
  	remove_wait_queue(event->wqh, &event->wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3546
  	event->unregister_event(memcg, event->eventfd);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3547
3548
3549
3550
3551
3552
  
  	/* Notify userspace the event is going away. */
  	eventfd_signal(event->eventfd, 1);
  
  	eventfd_ctx_put(event->eventfd);
  	kfree(event);
59b6f8734   Tejun Heo   memcg: make cgrou...
3553
  	css_put(&memcg->css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3554
3555
3556
3557
3558
3559
3560
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3561
3562
  static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
  			    int sync, void *key)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3563
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3564
3565
  	struct mem_cgroup_event *event =
  		container_of(wait, struct mem_cgroup_event, wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3566
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
  	unsigned long flags = (unsigned long)key;
  
  	if (flags & POLLHUP) {
  		/*
  		 * If the event has been detached at cgroup removal, we
  		 * can simply return knowing the other side will cleanup
  		 * for us.
  		 *
  		 * We can't race against event freeing since the other
  		 * side will require wqh->lock via remove_wait_queue(),
  		 * which we hold.
  		 */
fba948078   Tejun Heo   cgroup, memcg: mo...
3579
  		spin_lock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3580
3581
3582
3583
3584
3585
3586
3587
  		if (!list_empty(&event->list)) {
  			list_del_init(&event->list);
  			/*
  			 * We are in atomic context, but memcg_event_remove()
  			 * may sleep, so we have to call it from a workqueue.
  			 */
  			schedule_work(&event->remove);
  		}
fba948078   Tejun Heo   cgroup, memcg: mo...
3588
  		spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3589
3590
3591
3592
  	}
  
  	return 0;
  }
3bc942f37   Tejun Heo   memcg: rename cgr...
3593
  static void memcg_event_ptable_queue_proc(struct file *file,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3594
3595
  		wait_queue_head_t *wqh, poll_table *pt)
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3596
3597
  	struct mem_cgroup_event *event =
  		container_of(pt, struct mem_cgroup_event, pt);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3598
3599
3600
3601
3602
3603
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3604
3605
   * DO NOT USE IN NEW FILES.
   *
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3606
3607
3608
3609
3610
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
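  /*
   * For example, a memory threshold event is set up by creating an
   * eventfd, opening memory.usage_in_bytes, and writing
   * "<eventfd> <usage fd> <threshold in bytes>" to cgroup.event_control;
   * the eventfd is then signalled whenever usage crosses the threshold.
   */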
451af504d   Tejun Heo   cgroup: replace c...
3611
3612
  static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
  					 char *buf, size_t nbytes, loff_t off)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3613
  {
451af504d   Tejun Heo   cgroup: replace c...
3614
  	struct cgroup_subsys_state *css = of_css(of);
fba948078   Tejun Heo   cgroup, memcg: mo...
3615
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
3616
  	struct mem_cgroup_event *event;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3617
3618
3619
3620
  	struct cgroup_subsys_state *cfile_css;
  	unsigned int efd, cfd;
  	struct fd efile;
  	struct fd cfile;
fba948078   Tejun Heo   cgroup, memcg: mo...
3621
  	const char *name;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3622
3623
  	char *endp;
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
3624
3625
3626
  	buf = strstrip(buf);
  
  	efd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3627
3628
  	if (*endp != ' ')
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
3629
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3630

451af504d   Tejun Heo   cgroup: replace c...
3631
  	cfd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3632
3633
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
3634
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3635
3636
3637
3638
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
59b6f8734   Tejun Heo   memcg: make cgrou...
3639
  	event->memcg = memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3640
  	INIT_LIST_HEAD(&event->list);
3bc942f37   Tejun Heo   memcg: rename cgr...
3641
3642
3643
  	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
  	INIT_WORK(&event->remove, memcg_event_remove);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
  
  	efile = fdget(efd);
  	if (!efile.file) {
  		ret = -EBADF;
  		goto out_kfree;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile.file);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto out_put_efile;
  	}
  
  	cfile = fdget(cfd);
  	if (!cfile.file) {
  		ret = -EBADF;
  		goto out_put_eventfd;
  	}
  
  	/* The process needs read permission on the control file. */
  	/* AV: shouldn't we check that it's been opened for read instead? */
  	ret = inode_permission(file_inode(cfile.file), MAY_READ);
  	if (ret < 0)
  		goto out_put_cfile;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3668
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
3669
3670
3671
3672
  	 * Determine the event callbacks and set them in @event.  This used
  	 * to be done via struct cftype but cgroup core no longer knows
  	 * about these events.  The following is crude but the whole thing
  	 * is for compatibility anyway.
3bc942f37   Tejun Heo   memcg: rename cgr...
3673
3674
  	 *
  	 * DO NOT ADD NEW FILES.
fba948078   Tejun Heo   cgroup, memcg: mo...
3675
  	 */
b583043e9   Al Viro   kill f_dentry uses
3676
  	name = cfile.file->f_path.dentry->d_name.name;
fba948078   Tejun Heo   cgroup, memcg: mo...
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
  
  	if (!strcmp(name, "memory.usage_in_bytes")) {
  		event->register_event = mem_cgroup_usage_register_event;
  		event->unregister_event = mem_cgroup_usage_unregister_event;
  	} else if (!strcmp(name, "memory.oom_control")) {
  		event->register_event = mem_cgroup_oom_register_event;
  		event->unregister_event = mem_cgroup_oom_unregister_event;
  	} else if (!strcmp(name, "memory.pressure_level")) {
  		event->register_event = vmpressure_register_event;
  		event->unregister_event = vmpressure_unregister_event;
  	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
347c4a874   Tejun Heo   memcg: remove cgr...
3688
3689
  		event->register_event = memsw_cgroup_usage_register_event;
  		event->unregister_event = memsw_cgroup_usage_unregister_event;
fba948078   Tejun Heo   cgroup, memcg: mo...
3690
3691
3692
3693
3694
3695
  	} else {
  		ret = -EINVAL;
  		goto out_put_cfile;
  	}
  
  	/*
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
3696
3697
3698
  	 * Verify that @cfile belongs to @css.  Also, remaining events are
  	 * automatically removed on cgroup destruction but the removal is
  	 * asynchronous, so take an extra ref on @css.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3699
  	 */
b583043e9   Al Viro   kill f_dentry uses
3700
  	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
ec903c0c8   Tejun Heo   cgroup: rename cs...
3701
  					       &memory_cgrp_subsys);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3702
  	ret = -EINVAL;
5a17f543e   Tejun Heo   cgroup: improve c...
3703
  	if (IS_ERR(cfile_css))
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3704
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
3705
3706
  	if (cfile_css != css) {
  		css_put(cfile_css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3707
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
3708
  	}
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3709

451af504d   Tejun Heo   cgroup: replace c...
3710
  	ret = event->register_event(memcg, event->eventfd, buf);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3711
3712
3713
3714
  	if (ret)
  		goto out_put_css;
  
  	efile.file->f_op->poll(efile.file, &event->pt);
fba948078   Tejun Heo   cgroup, memcg: mo...
3715
3716
3717
  	spin_lock(&memcg->event_list_lock);
  	list_add(&event->list, &memcg->event_list);
  	spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3718
3719
3720
  
  	fdput(cfile);
  	fdput(efile);
451af504d   Tejun Heo   cgroup: replace c...
3721
  	return nbytes;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3722
3723
  
  out_put_css:
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
3724
  	css_put(css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
  out_put_cfile:
  	fdput(cfile);
  out_put_eventfd:
  	eventfd_ctx_put(event->eventfd);
  out_put_efile:
  	fdput(efile);
  out_kfree:
  	kfree(event);
  
  	return ret;
  }
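  /*
   * Control files for the legacy (cgroup v1) hierarchy.  Each entry's
   * ->private combines a counter type and an attribute via
   * MEMFILE_PRIVATE(), which the read/write/reset handlers above decode.
   */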
241994ed8   Johannes Weiner   mm: memcontrol: d...
3736
  static struct cftype mem_cgroup_legacy_files[] = {
8cdea7c05   Balbir Singh   Memory controller...
3737
  	{
0eea10301   Balbir Singh   Memory controller...
3738
  		.name = "usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3739
  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
3740
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
3741
3742
  	},
  	{
c84872e16   Pavel Emelyanov   memcgroup: add th...
3743
  		.name = "max_usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3744
  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
3745
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
3746
  		.read_u64 = mem_cgroup_read_u64,
c84872e16   Pavel Emelyanov   memcgroup: add th...
3747
3748
  	},
  	{
0eea10301   Balbir Singh   Memory controller...
3749
  		.name = "limit_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3750
  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
3751
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
3752
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
3753
3754
  	},
  	{
296c81d89   Balbir Singh   memory controller...
3755
3756
  		.name = "soft_limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
3757
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
3758
  		.read_u64 = mem_cgroup_read_u64,
296c81d89   Balbir Singh   memory controller...
3759
3760
  	},
  	{
8cdea7c05   Balbir Singh   Memory controller...
3761
  		.name = "failcnt",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3762
  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
3763
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
3764
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
3765
  	},
8697d3319   Balbir Singh   Memory controller...
3766
  	{
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3767
  		.name = "stat",
2da8ca822   Tejun Heo   cgroup: replace c...
3768
  		.seq_show = memcg_stat_show,
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3769
  	},
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3770
3771
  	{
  		.name = "force_empty",
6770c64e5   Tejun Heo   cgroup: replace c...
3772
  		.write = mem_cgroup_force_empty_write,
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3773
  	},
18f59ea7d   Balbir Singh   memcg: memory cgr...
3774
3775
3776
3777
3778
  	{
  		.name = "use_hierarchy",
  		.write_u64 = mem_cgroup_hierarchy_write,
  		.read_u64 = mem_cgroup_hierarchy_read,
  	},
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3779
  	{
3bc942f37   Tejun Heo   memcg: rename cgr...
3780
  		.name = "cgroup.event_control",		/* XXX: for compat */
451af504d   Tejun Heo   cgroup: replace c...
3781
  		.write = memcg_write_event_control,
7dbdb199d   Tejun Heo   cgroup: replace c...
3782
  		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3783
3784
  	},
  	{
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3785
3786
3787
3788
  		.name = "swappiness",
  		.read_u64 = mem_cgroup_swappiness_read,
  		.write_u64 = mem_cgroup_swappiness_write,
  	},
7dc74be03   Daisuke Nishimura   memcg: add interf...
3789
3790
3791
3792
3793
  	{
  		.name = "move_charge_at_immigrate",
  		.read_u64 = mem_cgroup_move_charge_read,
  		.write_u64 = mem_cgroup_move_charge_write,
  	},
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3794
3795
  	{
  		.name = "oom_control",
2da8ca822   Tejun Heo   cgroup: replace c...
3796
  		.seq_show = mem_cgroup_oom_control_read,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3797
  		.write_u64 = mem_cgroup_oom_control_write,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3798
3799
  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  	},
70ddf637e   Anton Vorontsov   memcg: add memory...
3800
3801
  	{
  		.name = "pressure_level",
70ddf637e   Anton Vorontsov   memcg: add memory...
3802
  	},
406eb0c9b   Ying Han   memcg: add memory...
3803
3804
3805
  #ifdef CONFIG_NUMA
  	{
  		.name = "numa_stat",
2da8ca822   Tejun Heo   cgroup: replace c...
3806
  		.seq_show = memcg_numa_stat_show,
406eb0c9b   Ying Han   memcg: add memory...
3807
3808
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
3809
3810
3811
  	{
  		.name = "kmem.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
3812
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
3813
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
3814
3815
3816
3817
  	},
  	{
  		.name = "kmem.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
3818
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
3819
3820
3821
3822
  	},
  	{
  		.name = "kmem.failcnt",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
3823
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
3824
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
3825
3826
3827
3828
  	},
  	{
  		.name = "kmem.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
3829
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
3830
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
3831
  	},
749c54151   Glauber Costa   memcg: aggregate ...
3832
3833
3834
  #ifdef CONFIG_SLABINFO
  	{
  		.name = "kmem.slabinfo",
b047501cd   Vladimir Davydov   memcg: use generi...
3835
3836
3837
3838
  		.seq_start = slab_start,
  		.seq_next = slab_next,
  		.seq_stop = slab_stop,
  		.seq_show = memcg_slab_show,
749c54151   Glauber Costa   memcg: aggregate ...
3839
3840
  	},
  #endif
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
  	{
  		.name = "kmem.tcp.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
  		.write = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "kmem.tcp.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "kmem.tcp.failcnt",
  		.private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "kmem.tcp.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
6bc103498   Tejun Heo   cgroup: convert m...
3864
  	{ },	/* terminate */
af36f906c   Tejun Heo   memcg: always cre...
3865
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3866

c0ff4b854   Raghavendra K T   memcg: rename mem...
3867
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3868
3869
  {
  	struct mem_cgroup_per_node *pn;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3870
  	struct mem_cgroup_per_zone *mz;
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
3871
  	int zone, tmp = node;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3872
3873
3874
3875
3876
3877
3878
3879
  	/*
  	 * This routine is called against possible nodes.
  	 * But it's a BUG to call kmalloc() against an offline node.
  	 *
  	 * TODO: this routine can waste a lot of memory for nodes which will
  	 *       never be onlined. It would be better to use a memory hotplug
  	 *       callback function.
  	 */
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
3880
3881
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
17295c88a   Jesper Juhl   memcg: use [kv]za...
3882
  	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3883
3884
  	if (!pn)
  		return 1;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3885

1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3886
3887
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
bea8c150a   Hugh Dickins   memcg: fix hotplu...
3888
  		lruvec_init(&mz->lruvec);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
3889
3890
  		mz->usage_in_excess = 0;
  		mz->on_tree = false;
d79154bb5   Hugh Dickins   memcg: replace me...
3891
  		mz->memcg = memcg;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3892
  	}
54f72fe02   Johannes Weiner   memcg: clean up m...
3893
  	memcg->nodeinfo[node] = pn;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3894
3895
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3896
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3897
  {
54f72fe02   Johannes Weiner   memcg: clean up m...
3898
  	kfree(memcg->nodeinfo[node]);
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
3899
  }
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3900
  static void mem_cgroup_free(struct mem_cgroup *memcg)
59927fb98   Hugh Dickins   memcg: free mem_c...
3901
  {
c8b2a36fb   Glauber Costa   memcg: execute th...
3902
  	int node;
59927fb98   Hugh Dickins   memcg: free mem_c...
3903

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3904
  	memcg_wb_domain_exit(memcg);
c8b2a36fb   Glauber Costa   memcg: execute th...
3905
3906
  	for_each_node(node)
  		free_mem_cgroup_per_zone_info(memcg, node);
c8b2a36fb   Glauber Costa   memcg: execute th...
3907
  	free_percpu(memcg->stat);
8ff69e2c8   Vladimir Davydov   memcg: do not use...
3908
  	kfree(memcg);
59927fb98   Hugh Dickins   memcg: free mem_c...
3909
  }
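  /*
   * Allocate a mem_cgroup sized for nr_node_ids per-node pointers along
   * with its percpu statistics, per-zone info and writeback domain; any
   * partial allocation is unwound through mem_cgroup_free().
   */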
3afe36b1f   Glauber Costa   memcg: always fre...
3910

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3911
  static struct mem_cgroup *mem_cgroup_alloc(void)
8cdea7c05   Balbir Singh   Memory controller...
3912
  {
d142e3e66   Glauber Costa   memcg: split part...
3913
  	struct mem_cgroup *memcg;
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3914
  	size_t size;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
3915
  	int node;
8cdea7c05   Balbir Singh   Memory controller...
3916

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3917
3918
3919
3920
  	size = sizeof(struct mem_cgroup);
  	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
  
  	memcg = kzalloc(size, GFP_KERNEL);
c0ff4b854   Raghavendra K T   memcg: rename mem...
3921
  	if (!memcg)
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3922
3923
3924
3925
3926
  		return NULL;
  
  	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  	if (!memcg->stat)
  		goto fail;
78fb74669   Pavel Emelianov   Memory controller...
3927

3ed28fa10   Bob Liu   memcg: cleanup fo...
3928
  	for_each_node(node)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3929
  		if (alloc_mem_cgroup_per_zone_info(memcg, node))
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3930
  			goto fail;
f64c3f549   Balbir Singh   memory controller...
3931

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3932
3933
  	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
  		goto fail;
28dbc4b6a   Balbir Singh   memcg: memory cgr...
3934

f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
3935
  	INIT_WORK(&memcg->high_work, high_work_func);
d142e3e66   Glauber Costa   memcg: split part...
3936
3937
  	memcg->last_scanned_node = MAX_NUMNODES;
  	INIT_LIST_HEAD(&memcg->oom_notify);
d142e3e66   Glauber Costa   memcg: split part...
3938
3939
  	mutex_init(&memcg->thresholds_lock);
  	spin_lock_init(&memcg->move_lock);
70ddf637e   Anton Vorontsov   memcg: add memory...
3940
  	vmpressure_init(&memcg->vmpressure);
fba948078   Tejun Heo   cgroup, memcg: mo...
3941
3942
  	INIT_LIST_HEAD(&memcg->event_list);
  	spin_lock_init(&memcg->event_list_lock);
d886f4e48   Johannes Weiner   mm: memcontrol: r...
3943
  	memcg->socket_pressure = jiffies;
127424c86   Johannes Weiner   mm: memcontrol: m...
3944
  #ifndef CONFIG_SLOB
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3945
  	memcg->kmemcg_id = -1;
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3946
  #endif
52ebea749   Tejun Heo   writeback: make b...
3947
3948
3949
  #ifdef CONFIG_CGROUP_WRITEBACK
  	INIT_LIST_HEAD(&memcg->cgwb_list);
  #endif
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3950
3951
3952
3953
  	return memcg;
  fail:
  	mem_cgroup_free(memcg);
  	return NULL;
d142e3e66   Glauber Costa   memcg: split part...
3954
  }
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3955
3956
  static struct cgroup_subsys_state * __ref
  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
d142e3e66   Glauber Costa   memcg: split part...
3957
  {
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3958
3959
3960
  	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
  	struct mem_cgroup *memcg;
  	long error = -ENOMEM;
d142e3e66   Glauber Costa   memcg: split part...
3961

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3962
3963
3964
  	memcg = mem_cgroup_alloc();
  	if (!memcg)
  		return ERR_PTR(error);
d142e3e66   Glauber Costa   memcg: split part...
3965

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3966
3967
3968
3969
3970
3971
3972
3973
  	memcg->high = PAGE_COUNTER_MAX;
  	memcg->soft_limit = PAGE_COUNTER_MAX;
  	if (parent) {
  		memcg->swappiness = mem_cgroup_swappiness(parent);
  		memcg->oom_kill_disable = parent->oom_kill_disable;
  	}
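	/*
	 * With use_hierarchy the page counters are parented, so charges
	 * propagate up the tree and ancestor limits apply; without it
	 * each group's counters stand alone.
	 */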
  	if (parent && parent->use_hierarchy) {
  		memcg->use_hierarchy = true;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3974
  		page_counter_init(&memcg->memory, &parent->memory);
37e843511   Vladimir Davydov   mm: memcontrol: c...
3975
  		page_counter_init(&memcg->swap, &parent->swap);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3976
3977
  		page_counter_init(&memcg->memsw, &parent->memsw);
  		page_counter_init(&memcg->kmem, &parent->kmem);
0db152981   Johannes Weiner   mm: memcontrol: f...
3978
  		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
18f59ea7d   Balbir Singh   memcg: memory cgr...
3979
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3980
  		page_counter_init(&memcg->memory, NULL);
37e843511   Vladimir Davydov   mm: memcontrol: c...
3981
  		page_counter_init(&memcg->swap, NULL);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3982
3983
  		page_counter_init(&memcg->memsw, NULL);
  		page_counter_init(&memcg->kmem, NULL);
0db152981   Johannes Weiner   mm: memcontrol: f...
3984
  		page_counter_init(&memcg->tcpmem, NULL);
8c7f6edbd   Tejun Heo   cgroup: mark subs...
3985
3986
3987
3988
3989
  		/*
		 * Deeper hierarchy with use_hierarchy == false doesn't make
  		 * much sense so let cgroup subsystem know about this
  		 * unfortunate state in our controller.
  		 */
d142e3e66   Glauber Costa   memcg: split part...
3990
  		if (parent != root_mem_cgroup)
073219e99   Tejun Heo   cgroup: clean up ...
3991
  			memory_cgrp_subsys.broken_hierarchy = true;
18f59ea7d   Balbir Singh   memcg: memory cgr...
3992
  	}
d64416377   Vladimir Davydov   memcg: rework mem...
3993

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
3994
3995
3996
3997
3998
  	/* The following stuff does not apply to the root */
  	if (!parent) {
  		root_mem_cgroup = memcg;
  		return &memcg->css;
  	}
b313aeee2   Vladimir Davydov   mm: memcontrol: e...
3999
  	error = memcg_online_kmem(memcg);
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
4000
4001
  	if (error)
  		goto fail;
127424c86   Johannes Weiner   mm: memcontrol: m...
4002

f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
4003
  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
ef12947c9   Johannes Weiner   mm: memcontrol: s...
4004
  		static_branch_inc(&memcg_sockets_enabled_key);
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
4005

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
  	return &memcg->css;
  fail:
  	mem_cgroup_free(memcg);
  	return NULL;
  }
  
  static int
  mem_cgroup_css_online(struct cgroup_subsys_state *css)
  {
  	if (css->id > MEM_CGROUP_ID_MAX)
  		return -ENOSPC;
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
4017
4018
  
  	return 0;
8cdea7c05   Balbir Singh   Memory controller...
4019
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4020
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4021
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4022
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
4023
  	struct mem_cgroup_event *event, *tmp;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4024
4025
4026
4027
4028
4029
  
  	/*
  	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the cgroup
	 * directory to avoid a race between userspace and kernelspace.
  	 */
fba948078   Tejun Heo   cgroup, memcg: mo...
4030
4031
  	spin_lock(&memcg->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4032
4033
4034
  		list_del_init(&event->list);
  		schedule_work(&event->remove);
  	}
fba948078   Tejun Heo   cgroup, memcg: mo...
4035
  	spin_unlock(&memcg->event_list_lock);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
4036

567e9ab2e   Johannes Weiner   mm: memcontrol: g...
4037
  	memcg_offline_kmem(memcg);
52ebea749   Tejun Heo   writeback: make b...
4038
  	wb_memcg_offline(memcg);
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4039
  }
6df38689e   Vladimir Davydov   mm: memcontrol: f...
4040
4041
4042
4043
4044
4045
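/*
 * Called after the last css reference is gone but before the structure is
 * freed: drop this memcg from the reclaim iterator caches so hierarchy
 * walks don't hand out a stale position.
 */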
  static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
  	invalidate_reclaim_iterators(memcg);
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4046
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
8cdea7c05   Balbir Singh   Memory controller...
4047
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4048
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
c268e9946   Daisuke Nishimura   memcg: fix hierar...
4049

f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
4050
  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
ef12947c9   Johannes Weiner   mm: memcontrol: s...
4051
  		static_branch_dec(&memcg_sockets_enabled_key);
127424c86   Johannes Weiner   mm: memcontrol: m...
4052

0db152981   Johannes Weiner   mm: memcontrol: f...
4053
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
d55f90bfa   Vladimir Davydov   net: drop tcp_mem...
4054
  		static_branch_dec(&memcg_sockets_enabled_key);
3893e302f   Johannes Weiner   mm: memcontrol: s...
4055

0b8f73e10   Johannes Weiner   mm: memcontrol: c...
4056
4057
4058
  	vmpressure_cleanup(&memcg->vmpressure);
  	cancel_work_sync(&memcg->high_work);
  	mem_cgroup_remove_from_trees(memcg);
d886f4e48   Johannes Weiner   mm: memcontrol: r...
4059
  	memcg_free_kmem(memcg);
0b8f73e10   Johannes Weiner   mm: memcontrol: c...
4060
  	mem_cgroup_free(memcg);
8cdea7c05   Balbir Singh   Memory controller...
4061
  }
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
  /**
   * mem_cgroup_css_reset - reset the states of a mem_cgroup
   * @css: the target css
   *
   * Reset the states of the mem_cgroup associated with @css.  This is
   * invoked when the userland requests disabling on the default hierarchy
   * but the memcg is pinned through dependency.  The memcg should stop
   * applying policies and should revert to the vanilla state as it may be
   * made visible again.
   *
   * The current implementation only resets the essential configurations.
   * This needs to be expanded to cover all the visible parts.
   */
  static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
d334c9bcb   Vladimir Davydov   mm: memcontrol: c...
4078
4079
4080
4081
4082
  	page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
  	page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
  	page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
  	page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
  	page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4083
4084
  	memcg->low = 0;
  	memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4085
  	memcg->soft_limit = PAGE_COUNTER_MAX;
2529bb3aa   Tejun Heo   writeback: reset ...
4086
  	memcg_wb_domain_size_changed(memcg);
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4087
  }
024914477   Daisuke Nishimura   memcg: move charg...
4088
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
4089
  /* Handlers for move charge at task migration. */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4090
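/*
 * Charge @count pages to mc.to up front: first as one bulk charge without
 * direct reclaim (cheap, may fail), then page by page with reclaim,
 * stopping at the first failure.
 */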
  static int mem_cgroup_do_precharge(unsigned long count)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4091
  {
05b843012   Johannes Weiner   mm: memcontrol: u...
4092
  	int ret;
9476db974   Johannes Weiner   mm: memcontrol: s...
4093

d0164adc8   Mel Gorman   mm, page_alloc: d...
4094
4095
  	/* Try a single bulk charge without reclaim first, kswapd may wake */
  	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
9476db974   Johannes Weiner   mm: memcontrol: s...
4096
  	if (!ret) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4097
  		mc.precharge += count;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4098
4099
  		return ret;
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4100
4101
  
  	/* Try charges one by one with reclaim */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4102
  	while (count--) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4103
  		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4104
  		if (ret)
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4105
  			return ret;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4106
  		mc.precharge++;
9476db974   Johannes Weiner   mm: memcontrol: s...
4107
  		cond_resched();
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4108
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4109
  	return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4110
4111
4112
  }
  
  /**
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4113
   * get_mctgt_type - get target type of moving charge
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4114
4115
4116
   * @vma: the vma the pte to be checked belongs to
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
4117
   * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4118
4119
4120
4121
4122
4123
   *
   * Returns
   *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   *     move charge. if @target is not NULL, the page is stored in target->page
   *     with an extra refcount taken (callers should handle it).
024914477   Daisuke Nishimura   memcg: move charg...
4124
4125
4126
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. if @target is not NULL, the entry is stored
   *     in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4127
4128
4129
   *
   * Called with pte lock held.
   */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4130
4131
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
4132
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4133
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4134
  enum mc_target_type {
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4135
  	MC_TARGET_NONE = 0,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4136
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
4137
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4138
  };
90254a658   Daisuke Nishimura   memcg: clean up m...
4139
4140
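/*
 * The mc_handle_*_pte() helpers below resolve a pte to the page (or swap
 * entry) whose charge may be moved, honouring the MOVE_ANON / MOVE_FILE
 * flags and taking a reference on any page they return.
 */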
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4141
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4142
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4143

90254a658   Daisuke Nishimura   memcg: clean up m...
4144
4145
4146
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4147
  		if (!(mc.flags & MOVE_ANON))
90254a658   Daisuke Nishimura   memcg: clean up m...
4148
  			return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4149
4150
4151
4152
  	} else {
  		if (!(mc.flags & MOVE_FILE))
  			return NULL;
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4153
4154
4155
4156
4157
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4158
  #ifdef CONFIG_SWAP
90254a658   Daisuke Nishimura   memcg: clean up m...
4159
4160
4161
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4162
4163
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4164
  	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
90254a658   Daisuke Nishimura   memcg: clean up m...
4165
  		return NULL;
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4166
4167
4168
4169
  	/*
  	 * Because lookup_swap_cache() updates some statistics counters,
  	 * we call find_get_page() with swapper_space directly.
  	 */
33806f06d   Shaohua Li   swap: make each s...
4170
  	page = find_get_page(swap_address_space(ent), ent.val);
7941d2145   Johannes Weiner   mm: memcontrol: d...
4171
  	if (do_memsw_account())
90254a658   Daisuke Nishimura   memcg: clean up m...
4172
4173
4174
4175
  		entry->val = ent.val;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4176
4177
4178
4179
4180
4181
4182
  #else
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	return NULL;
  }
  #endif
90254a658   Daisuke Nishimura   memcg: clean up m...
4183

87946a722   Daisuke Nishimura   memcg: move charg...
4184
4185
4186
4187
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4188
4189
4190
4191
4192
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4193
  	if (!(mc.flags & MOVE_FILE))
87946a722   Daisuke Nishimura   memcg: move charg...
4194
  		return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4195
  	mapping = vma->vm_file->f_mapping;
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4196
  	pgoff = linear_page_index(vma, addr);
87946a722   Daisuke Nishimura   memcg: move charg...
4197
4198
  
  	/* page is moved even if it's not RSS of this task (page-faulted). */
aa3b18955   Hugh Dickins   tmpfs: convert me...
4199
4200
  #ifdef CONFIG_SWAP
  	/* shmem/tmpfs may report page out on swap: account for that too. */
139b6a6fb   Johannes Weiner   mm: filemap: upda...
4201
4202
4203
4204
  	if (shmem_mapping(mapping)) {
  		page = find_get_entry(mapping, pgoff);
  		if (radix_tree_exceptional_entry(page)) {
  			swp_entry_t swp = radix_to_swp_entry(page);
7941d2145   Johannes Weiner   mm: memcontrol: d...
4205
  			if (do_memsw_account())
139b6a6fb   Johannes Weiner   mm: filemap: upda...
4206
4207
4208
4209
4210
4211
4212
  				*entry = swp;
  			page = find_get_page(swap_address_space(swp), swp.val);
  		}
  	} else
  		page = find_get_page(mapping, pgoff);
  #else
  	page = find_get_page(mapping, pgoff);
aa3b18955   Hugh Dickins   tmpfs: convert me...
4213
  #endif
87946a722   Daisuke Nishimura   memcg: move charg...
4214
4215
  	return page;
  }
b1b0deabb   Chen Gang   mm: memcontrol: l...
4216
4217
4218
4219
4220
4221
4222
  /**
   * mem_cgroup_move_account - move account of the page
   * @page: the page
   * @nr_pages: number of regular pages (>1 for huge pages)
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
   *
3ac808fdd   Kirill A. Shutemov   mm, thp: remove c...
4223
   * The caller must make sure the page is not on LRU (isolate_page() is useful.)
b1b0deabb   Chen Gang   mm: memcontrol: l...
4224
4225
4226
4227
4228
   *
   * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
   * from old cgroup.
   */
  static int mem_cgroup_move_account(struct page *page,
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4229
  				   bool compound,
b1b0deabb   Chen Gang   mm: memcontrol: l...
4230
4231
4232
4233
  				   struct mem_cgroup *from,
  				   struct mem_cgroup *to)
  {
  	unsigned long flags;
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4234
  	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
b1b0deabb   Chen Gang   mm: memcontrol: l...
4235
  	int ret;
c4843a759   Greg Thelen   memcg: add per cg...
4236
  	bool anon;
b1b0deabb   Chen Gang   mm: memcontrol: l...
4237
4238
4239
  
  	VM_BUG_ON(from == to);
  	VM_BUG_ON_PAGE(PageLRU(page), page);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4240
  	VM_BUG_ON(compound && !PageTransHuge(page));
b1b0deabb   Chen Gang   mm: memcontrol: l...
4241
4242
  
  	/*
6a93ca8fd   Johannes Weiner   mm: migrate: do n...
4243
  	 * Prevent mem_cgroup_migrate() from looking at
45637bab3   Hugh Dickins   mm: rename mem_cg...
4244
  	 * page->mem_cgroup of its source page while we change it.
b1b0deabb   Chen Gang   mm: memcontrol: l...
4245
  	 */
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4246
  	ret = -EBUSY;
b1b0deabb   Chen Gang   mm: memcontrol: l...
4247
4248
4249
4250
4251
4252
  	if (!trylock_page(page))
  		goto out;
  
  	ret = -EINVAL;
  	if (page->mem_cgroup != from)
  		goto out_unlock;
c4843a759   Greg Thelen   memcg: add per cg...
4253
  	anon = PageAnon(page);
b1b0deabb   Chen Gang   mm: memcontrol: l...
4254
  	spin_lock_irqsave(&from->move_lock, flags);
c4843a759   Greg Thelen   memcg: add per cg...
4255
  	if (!anon && page_mapped(page)) {
b1b0deabb   Chen Gang   mm: memcontrol: l...
4256
4257
4258
4259
4260
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  	}
c4843a759   Greg Thelen   memcg: add per cg...
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
  	/*
  	 * move_lock grabbed above and caller set from->moving_account, so
  	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
  	 * So mapping should be stable for dirty pages.
  	 */
  	if (!anon && PageDirty(page)) {
  		struct address_space *mapping = page_mapping(page);
  
  		if (mapping_cap_account_dirty(mapping)) {
  			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
  				       nr_pages);
  			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
  				       nr_pages);
  		}
  	}
b1b0deabb   Chen Gang   mm: memcontrol: l...
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
  	if (PageWriteback(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  	}
  
  	/*
  	 * It is safe to change page->mem_cgroup here because the page
  	 * is referenced, charged, and isolated - we can't race with
  	 * uncharging, charging, migration, or LRU putback.
  	 */
  
  	/* caller should have done css_get */
  	page->mem_cgroup = to;
  	spin_unlock_irqrestore(&from->move_lock, flags);
  
  	ret = 0;
  
  	local_irq_disable();
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4296
  	mem_cgroup_charge_statistics(to, page, compound, nr_pages);
b1b0deabb   Chen Gang   mm: memcontrol: l...
4297
  	memcg_check_events(to, page);
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4298
  	mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
b1b0deabb   Chen Gang   mm: memcontrol: l...
4299
4300
4301
4302
4303
4304
4305
  	memcg_check_events(from, page);
  	local_irq_enable();
  out_unlock:
  	unlock_page(page);
  out:
  	return ret;
  }
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4306
  static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
90254a658   Daisuke Nishimura   memcg: clean up m...
4307
4308
4309
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4310
  	enum mc_target_type ret = MC_TARGET_NONE;
90254a658   Daisuke Nishimura   memcg: clean up m...
4311
4312
4313
4314
4315
4316
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4317
  	else if (pte_none(ptent))
87946a722   Daisuke Nishimura   memcg: move charg...
4318
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
4319
4320
  
  	if (!page && !ent.val)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4321
  		return ret;
024914477   Daisuke Nishimura   memcg: move charg...
4322
  	if (page) {
024914477   Daisuke Nishimura   memcg: move charg...
4323
  		/*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4324
  		 * Do only loose check w/o serialization.
1306a85ae   Johannes Weiner   mm: embed the mem...
4325
  		 * mem_cgroup_move_account() checks the page is valid or
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4326
  		 * not under LRU exclusion.
024914477   Daisuke Nishimura   memcg: move charg...
4327
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
4328
  		if (page->mem_cgroup == mc.from) {
024914477   Daisuke Nishimura   memcg: move charg...
4329
4330
4331
4332
4333
4334
4335
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4336
4337
  	/* There is a swap entry and a page doesn't exist or isn't charged */
  	if (ent.val && !ret &&
34c00c319   Li Zefan   memcg: convert to...
4338
  	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4339
4340
4341
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4342
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4343
4344
  	return ret;
  }
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * We don't consider swapping or file mapped pages because THP does not
   * support them for now.
   * Caller should make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	struct page *page = NULL;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4355
4356
4357
  	enum mc_target_type ret = MC_TARGET_NONE;
  
  	page = pmd_page(pmd);
309381fea   Sasha Levin   mm: dump page whe...
4358
  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4359
  	if (!(mc.flags & MOVE_ANON))
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4360
  		return ret;
1306a85ae   Johannes Weiner   mm: embed the mem...
4361
  	if (page->mem_cgroup == mc.from) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
  		ret = MC_TARGET_PAGE;
  		if (target) {
  			get_page(page);
  			target->page = page;
  		}
  	}
  	return ret;
  }
  #else
  static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	return MC_TARGET_NONE;
  }
  #endif
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4377
4378
4379
4380
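/*
 * First pass of the charge-moving walk: count how many charges will be
 * needed (one per movable pte, HPAGE_PMD_NR per movable huge pmd) so they
 * can be precharged to the destination before any page is actually moved.
 */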
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4381
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4382
4383
  	pte_t *pte;
  	spinlock_t *ptl;
b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
4384
4385
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (ptl) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4386
4387
  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
  			mc.precharge += HPAGE_PMD_NR;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4388
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4389
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4390
  	}
033193275   Dave Hansen   pagewalk: only sp...
4391

45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4392
4393
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4394
4395
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4396
  		if (get_mctgt_type(vma, addr, *pte, NULL))
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4397
4398
4399
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4400
4401
  	return 0;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4402
4403
4404
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4405

26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4406
4407
4408
4409
  	struct mm_walk mem_cgroup_count_precharge_walk = {
  		.pmd_entry = mem_cgroup_count_precharge_pte_range,
  		.mm = mm,
  	};
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4410
  	down_read(&mm->mmap_sem);
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4411
  	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4412
  	up_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4413
4414
4415
4416
4417
4418
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4419
4420
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4421
4422
4423
4424
4425
  	unsigned long precharge = mem_cgroup_count_precharge(mm);
  
  	VM_BUG_ON(mc.moving_task);
  	mc.moving_task = current;
  	return mem_cgroup_do_precharge(precharge);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4426
  }
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4427
4428
  /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
  static void __mem_cgroup_clear_mc(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4429
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4430
4431
  	struct mem_cgroup *from = mc.from;
  	struct mem_cgroup *to = mc.to;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4432
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4433
  	if (mc.precharge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4434
  		cancel_charge(mc.to, mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4435
4436
4437
4438
4439
4440
4441
  		mc.precharge = 0;
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4442
  		cancel_charge(mc.from, mc.moved_charge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4443
  		mc.moved_charge = 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4444
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
4445
4446
  	/* we must fixup refcnts and charges */
  	if (mc.moved_swap) {
483c30b51   Daisuke Nishimura   memcg: improve pe...
4447
  		/* uncharge swap account from the old cgroup */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4448
  		if (!mem_cgroup_is_root(mc.from))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4449
  			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4450

05b843012   Johannes Weiner   mm: memcontrol: u...
4451
  		/*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4452
4453
  		 * we charged both to->memory and to->memsw, so we
  		 * should uncharge to->memory.
05b843012   Johannes Weiner   mm: memcontrol: u...
4454
  		 */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4455
  		if (!mem_cgroup_is_root(mc.to))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4456
  			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
4457
  		css_put_many(&mc.from->css, mc.moved_swap);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4458

4050377b5   Li Zefan   memcg: use css_ge...
4459
  		/* we've already done css_get(mc.to) */
483c30b51   Daisuke Nishimura   memcg: improve pe...
4460
4461
  		mc.moved_swap = 0;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4462
4463
4464
4465
4466
4467
4468
  	memcg_oom_recover(from);
  	memcg_oom_recover(to);
  	wake_up_all(&mc.waitq);
  }
  
  static void mem_cgroup_clear_mc(void)
  {
264a0ae16   Tejun Heo   memcg: relocate c...
4469
  	struct mm_struct *mm = mc.mm;
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4470
4471
4472
4473
4474
4475
  	/*
  	 * we must clear moving_task before waking up waiters at the end of
  	 * task migration.
  	 */
  	mc.moving_task = NULL;
  	__mem_cgroup_clear_mc();
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4476
  	spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4477
4478
  	mc.from = NULL;
  	mc.to = NULL;
264a0ae16   Tejun Heo   memcg: relocate c...
4479
  	mc.mm = NULL;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4480
  	spin_unlock(&mc.lock);
264a0ae16   Tejun Heo   memcg: relocate c...
4481
4482
  
  	mmput(mm);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4483
  }
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4484
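/*
 * can_attach sets up the move-charge context (mc) on the legacy hierarchy
 * when the destination memcg has move_charge_at_immigrate enabled: it
 * records the source and destination memcgs, pins the leader's mm, and
 * precharges the destination for every movable page.
 */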
  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4485
  {
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4486
  	struct cgroup_subsys_state *css;
eed67d75b   Ross Zwisler   cgroup: Fix unini...
4487
  	struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
9f2115f93   Tejun Heo   memcg: restructur...
4488
  	struct mem_cgroup *from;
4530eddb5   Tejun Heo   cgroup, memcg, cp...
4489
  	struct task_struct *leader, *p;
9f2115f93   Tejun Heo   memcg: restructur...
4490
  	struct mm_struct *mm;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4491
  	unsigned long move_flags;
9f2115f93   Tejun Heo   memcg: restructur...
4492
  	int ret = 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4493

1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4494
4495
  	/* charge immigration isn't supported on the default hierarchy */
  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
9f2115f93   Tejun Heo   memcg: restructur...
4496
  		return 0;
4530eddb5   Tejun Heo   cgroup, memcg, cp...
4497
4498
4499
4500
4501
4502
4503
  	/*
  	 * Multi-process migrations only happen on the default hierarchy
  	 * where charge immigration is not used.  Perform charge
  	 * immigration if @tset contains a leader and whine if there are
  	 * multiple.
  	 */
  	p = NULL;
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4504
  	cgroup_taskset_for_each_leader(leader, css, tset) {
4530eddb5   Tejun Heo   cgroup, memcg, cp...
4505
4506
  		WARN_ON_ONCE(p);
  		p = leader;
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4507
  		memcg = mem_cgroup_from_css(css);
4530eddb5   Tejun Heo   cgroup, memcg, cp...
4508
4509
4510
  	}
  	if (!p)
  		return 0;
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4511
4512
4513
4514
4515
4516
4517
4518
  	/*
	 * We are now committed to this value, whatever it is. Changes in this
  	 * tunable will only affect upcoming migrations, not the current one.
  	 * So we need to save it, and keep it going.
  	 */
  	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
  	if (!move_flags)
  		return 0;
9f2115f93   Tejun Heo   memcg: restructur...
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
  	from = mem_cgroup_from_task(p);
  
  	VM_BUG_ON(from == memcg);
  
  	mm = get_task_mm(p);
  	if (!mm)
  		return 0;
  	/* We move charges only when we move an owner of the mm */
  	if (mm->owner == p) {
  		VM_BUG_ON(mc.from);
  		VM_BUG_ON(mc.to);
  		VM_BUG_ON(mc.precharge);
  		VM_BUG_ON(mc.moved_charge);
  		VM_BUG_ON(mc.moved_swap);
  
  		spin_lock(&mc.lock);
264a0ae16   Tejun Heo   memcg: relocate c...
4535
  		mc.mm = mm;
9f2115f93   Tejun Heo   memcg: restructur...
4536
4537
4538
4539
4540
4541
4542
4543
4544
  		mc.from = from;
  		mc.to = memcg;
  		mc.flags = move_flags;
  		spin_unlock(&mc.lock);
  		/* We set mc.moving_task later */
  
  		ret = mem_cgroup_precharge_mc(mm);
  		if (ret)
  			mem_cgroup_clear_mc();
264a0ae16   Tejun Heo   memcg: relocate c...
4545
4546
  	} else {
  		mmput(mm);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4547
4548
4549
  	}
  	return ret;
  }
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4550
  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4551
  {
4e2f245d3   Johannes Weiner   mm: memcontrol: d...
4552
4553
  	if (mc.to)
  		mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4554
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4555
4556
4557
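/*
 * Second pass of the charge-moving walk: for each movable pte or huge pmd,
 * consume precharges and move the page charge from mc.from to mc.to with
 * mem_cgroup_move_account(); swap entries go through
 * mem_cgroup_move_swap_account().  If the precharge pool runs dry, one more
 * page is charged and the walk retried.
 */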
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4558
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4559
  	int ret = 0;
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4560
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4561
4562
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4563
4564
4565
  	enum mc_target_type target_type;
  	union mc_target target;
  	struct page *page;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4566

b6ec57f4b   Kirill A. Shutemov   thp: change pmd_t...
4567
4568
  	ptl = pmd_trans_huge_lock(pmd, vma);
  	if (ptl) {
62ade86ab   Hugh Dickins   memcg,thp: fix re...
4569
  		if (mc.precharge < HPAGE_PMD_NR) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4570
  			spin_unlock(ptl);
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4571
4572
4573
4574
4575
4576
  			return 0;
  		}
  		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
  		if (target_type == MC_TARGET_PAGE) {
  			page = target.page;
  			if (!isolate_lru_page(page)) {
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4577
  				if (!mem_cgroup_move_account(page, true,
1306a85ae   Johannes Weiner   mm: embed the mem...
4578
  							     mc.from, mc.to)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4579
4580
4581
4582
4583
4584
4585
  					mc.precharge -= HPAGE_PMD_NR;
  					mc.moved_charge += HPAGE_PMD_NR;
  				}
  				putback_lru_page(page);
  			}
  			put_page(page);
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4586
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4587
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4588
  	}
45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4589
4590
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4591
4592
4593
4594
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
024914477   Daisuke Nishimura   memcg: move charg...
4595
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4596
4597
4598
  
  		if (!mc.precharge)
  			break;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4599
  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4600
4601
  		case MC_TARGET_PAGE:
  			page = target.page;
53f9263ba   Kirill A. Shutemov   mm: rework mapcou...
4602
4603
4604
4605
4606
4607
4608
4609
  			/*
  			 * We can have a part of the split pmd here. Moving it
  			 * can be done but it would be too convoluted so simply
			 * ignore such a partial THP and keep it in the original
			 * memcg. There should be somebody mapping the head.
  			 */
  			if (PageTransCompound(page))
  				goto put;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4610
4611
  			if (isolate_lru_page(page))
  				goto put;
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
4612
4613
  			if (!mem_cgroup_move_account(page, false,
  						mc.from, mc.to)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4614
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4615
4616
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4617
4618
  			}
  			putback_lru_page(page);
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4619
  put:			/* get_mctgt_type() gets the page */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4620
4621
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
4622
4623
  		case MC_TARGET_SWAP:
  			ent = target.ent;
e91cbb425   Hugh Dickins   memcg swap: mem_c...
4624
  			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
024914477   Daisuke Nishimura   memcg: move charg...
4625
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
4626
4627
4628
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
4629
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all precharges we got in can_attach().
		 * We try to charge one by one, but don't do any additional
		 * charges to mc.to if we have already failed a charge once in
		 * the attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4644
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4645
4646
4647
4648
4649
4650
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
264a0ae16   Tejun Heo   memcg: relocate c...
4651
  static void mem_cgroup_move_charge(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4652
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4653
4654
  	struct mm_walk mem_cgroup_move_charge_walk = {
  		.pmd_entry = mem_cgroup_move_charge_pte_range,
264a0ae16   Tejun Heo   memcg: relocate c...
4655
  		.mm = mc.mm,
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4656
  	};
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4657
4658
  
  	lru_add_drain_all();
312722cbb   Johannes Weiner   mm: memcontrol: s...
4659
  	/*
81f8c3a46   Johannes Weiner   mm: memcontrol: g...
4660
4661
4662
  	 * Signal lock_page_memcg() to take the memcg's move_lock
  	 * while we're moving its pages to another memcg. Then wait
  	 * for already started RCU-only updates to finish.
312722cbb   Johannes Weiner   mm: memcontrol: s...
4663
4664
4665
  	 */
  	atomic_inc(&mc.from->moving_account);
  	synchronize_rcu();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4666
  retry:
264a0ae16   Tejun Heo   memcg: relocate c...
4667
  	if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
  		/*
		 * Someone who is holding the mmap_sem might be waiting in the
  		 * waitq. So we cancel all extra charges, wake up all waiters,
  		 * and retry. Because we cancel precharges, we might not be able
  		 * to move enough charges, but moving charge is a best-effort
  		 * feature anyway, so it wouldn't be a big problem.
  		 */
  		__mem_cgroup_clear_mc();
  		cond_resched();
  		goto retry;
  	}
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4679
4680
4681
4682
4683
  	/*
	 * When we have consumed all precharges and failed to do an
	 * additional charge, the page walk just aborts.
  	 */
  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
264a0ae16   Tejun Heo   memcg: relocate c...
4684
  	up_read(&mc.mm->mmap_sem);
312722cbb   Johannes Weiner   mm: memcontrol: s...
4685
  	atomic_dec(&mc.from->moving_account);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4686
  }
264a0ae16   Tejun Heo   memcg: relocate c...
4687
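/*
 * post_attach callback: the task has fully switched cgroups by now, so its
 * mm can be walked and the charges selected in can_attach() moved over.
 */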
  static void mem_cgroup_move_task(void)
67e465a77   Balbir Singh   Memory controller...
4688
  {
264a0ae16   Tejun Heo   memcg: relocate c...
4689
4690
  	if (mc.to) {
  		mem_cgroup_move_charge();
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
4691
  		mem_cgroup_clear_mc();
264a0ae16   Tejun Heo   memcg: relocate c...
4692
  	}
67e465a77   Balbir Singh   Memory controller...
4693
  }
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4694
  #else	/* !CONFIG_MMU */
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4695
  static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4696
4697
4698
  {
  	return 0;
  }
1f7dd3e5a   Tejun Heo   cgroup: fix handl...
4699
  static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4700
4701
  {
  }
264a0ae16   Tejun Heo   memcg: relocate c...
4702
  static void mem_cgroup_move_task(void)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4703
4704
4705
  {
  }
  #endif
67e465a77   Balbir Singh   Memory controller...
4706

f00baae7a   Tejun Heo   memcg: force use_...
4707
4708
  /*
   * Cgroup retains root cgroups across [un]mount cycles making it necessary
aa6ec29be   Tejun Heo   cgroup: remove sa...
4709
4710
   * to verify whether we're attached to the default hierarchy on each mount
   * attempt.
f00baae7a   Tejun Heo   memcg: force use_...
4711
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
4712
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
f00baae7a   Tejun Heo   memcg: force use_...
4713
4714
  {
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
4715
  	 * use_hierarchy is forced on the default hierarchy.  cgroup core
f00baae7a   Tejun Heo   memcg: force use_...
4716
4717
4718
  	 * guarantees that @root doesn't have any children, so turning it
  	 * on for the root memcg is enough.
  	 */
9e10a130d   Tejun Heo   cgroup: replace c...
4719
  	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7feee590b   Vladimir Davydov   memcg: disable hi...
4720
4721
4722
  		root_mem_cgroup->use_hierarchy = true;
  	else
  		root_mem_cgroup->use_hierarchy = false;
f00baae7a   Tejun Heo   memcg: force use_...
4723
  }
241994ed8   Johannes Weiner   mm: memcontrol: d...
4724
4725
4726
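/*
 * Cgroup2 ("default hierarchy") interface files: memory.current,
 * memory.low, memory.high, memory.max, memory.events and memory.stat.
 * Limits and usage are kept in pages internally and exported in bytes;
 * event counts are exported as-is.
 */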
  static u64 memory_current_read(struct cgroup_subsys_state *css,
  			       struct cftype *cft)
  {
f5fc3c5d8   Johannes Weiner   mm: memcontrol: e...
4727
4728
4729
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
  	return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
241994ed8   Johannes Weiner   mm: memcontrol: d...
4730
4731
4732
4733
4734
  }
  
  static int memory_low_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
4735
  	unsigned long low = READ_ONCE(memcg->low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4736
4737
  
  	if (low == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
4738
4739
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
  	else
  		seq_printf(m, "%llu
  ", (u64)low * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_low_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long low;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
4755
  	err = page_counter_memparse(buf, "max", &low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
  	if (err)
  		return err;
  
  	memcg->low = low;
  
  	return nbytes;
  }
  
  static int memory_high_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
4767
  	unsigned long high = READ_ONCE(memcg->high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4768
4769
  
  	if (high == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
4770
4771
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
  	else
  		seq_printf(m, "%llu
  ", (u64)high * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_high_write(struct kernfs_open_file *of,
  				 char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
588083bb3   Johannes Weiner   mm: memcontrol: r...
4783
  	unsigned long nr_pages;
241994ed8   Johannes Weiner   mm: memcontrol: d...
4784
4785
4786
4787
  	unsigned long high;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
4788
  	err = page_counter_memparse(buf, "max", &high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4789
4790
4791
4792
  	if (err)
  		return err;
  
  	memcg->high = high;
588083bb3   Johannes Weiner   mm: memcontrol: r...
4793
4794
4795
4796
  	nr_pages = page_counter_read(&memcg->memory);
  	if (nr_pages > high)
  		try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
  					     GFP_KERNEL, true);
2529bb3aa   Tejun Heo   writeback: reset ...
4797
  	memcg_wb_domain_size_changed(memcg);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4798
4799
4800
4801
4802
4803
  	return nbytes;
  }
  
  static int memory_max_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
4804
  	unsigned long max = READ_ONCE(memcg->memory.limit);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4805
4806
  
  	if (max == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
4807
4808
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
  	else
  		seq_printf(m, "%llu
  ", (u64)max * PAGE_SIZE);
  
  	return 0;
  }
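/*
 * Writing memory.max installs the new limit first, then loops until usage
 * fits under it: drain the per-cpu charge stocks once, retry direct reclaim
 * (up to MEM_CGROUP_RECLAIM_RETRIES failed attempts), and finally fall back
 * to OOM-killing inside the group.  A pending signal stops the loop early.
 */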
  
  static ssize_t memory_max_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
b6e6edcfa   Johannes Weiner   mm: memcontrol: r...
4820
4821
  	unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
  	bool drained = false;
241994ed8   Johannes Weiner   mm: memcontrol: d...
4822
4823
4824
4825
  	unsigned long max;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
4826
  	err = page_counter_memparse(buf, "max", &max);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4827
4828
  	if (err)
  		return err;
b6e6edcfa   Johannes Weiner   mm: memcontrol: r...
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
  	xchg(&memcg->memory.limit, max);
  
  	for (;;) {
  		unsigned long nr_pages = page_counter_read(&memcg->memory);
  
  		if (nr_pages <= max)
  			break;
  
  		if (signal_pending(current)) {
  			err = -EINTR;
  			break;
  		}
  
  		if (!drained) {
  			drain_all_stock(memcg);
  			drained = true;
  			continue;
  		}
  
  		if (nr_reclaims) {
  			if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
  							  GFP_KERNEL, true))
  				nr_reclaims--;
  			continue;
  		}
  
  		mem_cgroup_events(memcg, MEMCG_OOM, 1);
  		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
  			break;
  	}
241994ed8   Johannes Weiner   mm: memcontrol: d...
4859

2529bb3aa   Tejun Heo   writeback: reset ...
4860
  	memcg_wb_domain_size_changed(memcg);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
  	return nbytes;
  }
  
  static int memory_events_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  
  	seq_printf(m, "low %lu
  ", mem_cgroup_read_events(memcg, MEMCG_LOW));
  	seq_printf(m, "high %lu
  ", mem_cgroup_read_events(memcg, MEMCG_HIGH));
  	seq_printf(m, "max %lu
  ", mem_cgroup_read_events(memcg, MEMCG_MAX));
  	seq_printf(m, "oom %lu
  ", mem_cgroup_read_events(memcg, MEMCG_OOM));
  
  	return 0;
  }
587d9f726   Johannes Weiner   mm: memcontrol: b...
4879
4880
4881
  static int memory_stat_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4882
4883
  	unsigned long stat[MEMCG_NR_STAT];
  	unsigned long events[MEMCG_NR_EVENTS];
587d9f726   Johannes Weiner   mm: memcontrol: b...
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
  	int i;
  
  	/*
  	 * Provide statistics on the state of the memory subsystem as
  	 * well as cumulative event counters that show past behavior.
  	 *
  	 * This list is ordered following a combination of these gradients:
  	 * 1) generic big picture -> specifics and details
  	 * 2) reflecting userspace activity -> reflecting kernel heuristics
  	 *
  	 * Current memory state:
  	 */
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4896
4897
  	tree_stat(memcg, stat);
  	tree_events(memcg, events);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4898
4899
  	seq_printf(m, "anon %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4900
  		   (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4901
4902
  	seq_printf(m, "file %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4903
  		   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
12580e4b5   Vladimir Davydov   mm: memcontrol: r...
4904
4905
4906
  	seq_printf(m, "kernel_stack %llu
  ",
  		   (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE);
27ee57c93   Vladimir Davydov   mm: memcontrol: r...
4907
4908
4909
4910
  	seq_printf(m, "slab %llu
  ",
  		   (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
  			 stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
b2807f07f   Johannes Weiner   mm: memcontrol: a...
4911
4912
  	seq_printf(m, "sock %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4913
  		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4914
4915
4916
  
  	seq_printf(m, "file_mapped %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4917
  		   (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4918
4919
  	seq_printf(m, "file_dirty %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4920
  		   (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4921
4922
  	seq_printf(m, "file_writeback %llu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4923
  		   (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
  
  	for (i = 0; i < NR_LRU_LISTS; i++) {
  		struct mem_cgroup *mi;
  		unsigned long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
  		seq_printf(m, "%s %llu
  ",
  			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
  	}
27ee57c93   Vladimir Davydov   mm: memcontrol: r...
4935
4936
4937
4938
4939
4940
  	seq_printf(m, "slab_reclaimable %llu
  ",
  		   (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
  	seq_printf(m, "slab_unreclaimable %llu
  ",
  		   (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4941
4942
4943
4944
  	/* Accumulated memory events */
  
  	seq_printf(m, "pgfault %lu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4945
  		   events[MEM_CGROUP_EVENTS_PGFAULT]);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4946
4947
  	seq_printf(m, "pgmajfault %lu
  ",
72b54e731   Vladimir Davydov   mm: memcontrol: m...
4948
  		   events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
587d9f726   Johannes Weiner   mm: memcontrol: b...
4949
4950
4951
  
  	return 0;
  }
241994ed8   Johannes Weiner   mm: memcontrol: d...
4952
4953
4954
  static struct cftype memory_files[] = {
  	{
  		.name = "current",
f5fc3c5d8   Johannes Weiner   mm: memcontrol: e...
4955
  		.flags = CFTYPE_NOT_ON_ROOT,
241994ed8   Johannes Weiner   mm: memcontrol: d...
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
  		.read_u64 = memory_current_read,
  	},
  	{
  		.name = "low",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_low_show,
  		.write = memory_low_write,
  	},
  	{
  		.name = "high",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_high_show,
  		.write = memory_high_write,
  	},
  	{
  		.name = "max",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_max_show,
  		.write = memory_max_write,
  	},
  	{
  		.name = "events",
  		.flags = CFTYPE_NOT_ON_ROOT,
472912a2b   Tejun Heo   memcg: generate f...
4979
  		.file_offset = offsetof(struct mem_cgroup, events_file),
241994ed8   Johannes Weiner   mm: memcontrol: d...
4980
4981
  		.seq_show = memory_events_show,
  	},
587d9f726   Johannes Weiner   mm: memcontrol: b...
4982
4983
4984
4985
4986
  	{
  		.name = "stat",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_stat_show,
  	},
241994ed8   Johannes Weiner   mm: memcontrol: d...
4987
4988
  	{ }	/* terminate */
  };
073219e99   Tejun Heo   cgroup: clean up ...
4989
  struct cgroup_subsys memory_cgrp_subsys = {
92fb97487   Tejun Heo   cgroup: rename ->...
4990
  	.css_alloc = mem_cgroup_css_alloc,
d142e3e66   Glauber Costa   memcg: split part...
4991
  	.css_online = mem_cgroup_css_online,
92fb97487   Tejun Heo   cgroup: rename ->...
4992
  	.css_offline = mem_cgroup_css_offline,
6df38689e   Vladimir Davydov   mm: memcontrol: f...
4993
  	.css_released = mem_cgroup_css_released,
92fb97487   Tejun Heo   cgroup: rename ->...
4994
  	.css_free = mem_cgroup_css_free,
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4995
  	.css_reset = mem_cgroup_css_reset,
7dc74be03   Daisuke Nishimura   memcg: add interf...
4996
4997
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
264a0ae16   Tejun Heo   memcg: relocate c...
4998
  	.post_attach = mem_cgroup_move_task,
f00baae7a   Tejun Heo   memcg: force use_...
4999
  	.bind = mem_cgroup_bind,
241994ed8   Johannes Weiner   mm: memcontrol: d...
5000
5001
  	.dfl_cftypes = memory_files,
  	.legacy_cftypes = mem_cgroup_legacy_files,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5002
  	.early_init = 0,
8cdea7c05   Balbir Singh   Memory controller...
5003
  };
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5004

241994ed8   Johannes Weiner   mm: memcontrol: d...
5005
  /**
241994ed8   Johannes Weiner   mm: memcontrol: d...
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
   * mem_cgroup_low - check if memory consumption is below the normal range
   * @root: the highest ancestor to consider
   * @memcg: the memory cgroup to check
   *
   * Returns %true if memory consumption of @memcg, and that of all
   * configurable ancestors up to @root, is below the normal range.
   */
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  {
  	if (mem_cgroup_disabled())
  		return false;
  
  	/*
  	 * The toplevel group doesn't have a configurable range, so
  	 * it's never low when looked at directly, and it is not
  	 * considered an ancestor when assessing the hierarchy.
  	 */
  
  	if (memcg == root_mem_cgroup)
  		return false;
4e54dede3   Michal Hocko   memcg: fix low li...
5026
  	if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5027
5028
5029
5030
5031
5032
5033
  		return false;
  
  	while (memcg != root) {
  		memcg = parent_mem_cgroup(memcg);
  
  		if (memcg == root_mem_cgroup)
  			break;
4e54dede3   Michal Hocko   memcg: fix low li...
5034
  		if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5035
5036
5037
5038
  			return false;
  	}
  	return true;
  }
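  
  /*
   * Editor's sketch, not part of the original file: one way a reclaim
   * loop might consult mem_cgroup_low() to honour the low limit checked
   * above.  The helper name example_skip_protected() and its may_thrash
   * argument are hypothetical; the in-tree consumer is the reclaim code
   * in mm/vmscan.c.
   */
  static inline bool example_skip_protected(struct mem_cgroup *root,
  					  struct mem_cgroup *memcg,
  					  bool may_thrash)
  {
  	/*
  	 * A group whose usage, and that of all ancestors up to @root,
  	 * is below the low limit is normally skipped; only a reclaimer
  	 * that is already thrashing may dip into it.
  	 */
  	if (mem_cgroup_low(root, memcg))
  		return !may_thrash;
  	return false;
  }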
00501b531   Johannes Weiner   mm: memcontrol: r...
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
  /**
   * mem_cgroup_try_charge - try charging a page
   * @page: page to charge
   * @mm: mm context of the victim
   * @gfp_mask: reclaim mode
   * @memcgp: charged memcg return
   *
   * Try to charge @page to the memcg that @mm belongs to, reclaiming
   * pages according to @gfp_mask if necessary.
   *
   * Returns 0 on success, with *@memcgp pointing to the charged memcg.
   * Otherwise, an error code is returned.
   *
   * After page->mapping has been set up, the caller must finalize the
   * charge with mem_cgroup_commit_charge().  Or abort the transaction
   * with mem_cgroup_cancel_charge() in case page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5057
5058
  			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
  			  bool compound)
00501b531   Johannes Weiner   mm: memcontrol: r...
5059
5060
  {
  	struct mem_cgroup *memcg = NULL;
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5061
  	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
00501b531   Johannes Weiner   mm: memcontrol: r...
5062
5063
5064
5065
5066
5067
  	int ret = 0;
  
  	if (mem_cgroup_disabled())
  		goto out;
  
  	if (PageSwapCache(page)) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5068
5069
5070
5071
5072
5073
5074
  		/*
  		 * Every swap fault against a single page tries to charge the
  		 * page, bail as early as possible.  shmem_unuse() encounters
  		 * already charged pages, too.  The USED bit is protected by
  		 * the page lock, which serializes swap cache removal, which
  		 * in turn serializes uncharging.
  		 */
e993d905c   Vladimir Davydov   memcg: zap try_ge...
5075
  		VM_BUG_ON_PAGE(!PageLocked(page), page);
1306a85ae   Johannes Weiner   mm: embed the mem...
5076
  		if (page->mem_cgroup)
00501b531   Johannes Weiner   mm: memcontrol: r...
5077
  			goto out;
e993d905c   Vladimir Davydov   memcg: zap try_ge...
5078

37e843511   Vladimir Davydov   mm: memcontrol: c...
5079
  		if (do_swap_account) {
e993d905c   Vladimir Davydov   memcg: zap try_ge...
5080
5081
5082
5083
5084
5085
5086
5087
5088
  			swp_entry_t ent = { .val = page_private(page), };
  			unsigned short id = lookup_swap_cgroup_id(ent);
  
  			rcu_read_lock();
  			memcg = mem_cgroup_from_id(id);
  			if (memcg && !css_tryget_online(&memcg->css))
  				memcg = NULL;
  			rcu_read_unlock();
  		}
00501b531   Johannes Weiner   mm: memcontrol: r...
5089
  	}
00501b531   Johannes Weiner   mm: memcontrol: r...
5090
5091
5092
5093
5094
5095
  	if (!memcg)
  		memcg = get_mem_cgroup_from_mm(mm);
  
  	ret = try_charge(memcg, gfp_mask, nr_pages);
  
  	css_put(&memcg->css);
00501b531   Johannes Weiner   mm: memcontrol: r...
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
  out:
  	*memcgp = memcg;
  	return ret;
  }
  
  /**
   * mem_cgroup_commit_charge - commit a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   * @lrucare: page might be on LRU already
   *
   * Finalize a charge transaction started by mem_cgroup_try_charge(),
   * after page->mapping has been set up.  This must happen atomically
   * as part of the page instantiation, i.e. under the page table lock
   * for anonymous pages, under the page lock for page and swap cache.
   *
   * In addition, the page must not be on the LRU during the commit, to
   * prevent racing with task migration.  If it might be, use @lrucare.
   *
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5118
  			      bool lrucare, bool compound)
00501b531   Johannes Weiner   mm: memcontrol: r...
5119
  {
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5120
  	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
00501b531   Johannes Weiner   mm: memcontrol: r...
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
  
  	VM_BUG_ON_PAGE(!page->mapping, page);
  	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5134
  	commit_charge(page, memcg, lrucare);
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5135
  	local_irq_disable();
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5136
  	mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5137
5138
  	memcg_check_events(memcg, page);
  	local_irq_enable();
00501b531   Johannes Weiner   mm: memcontrol: r...
5139

7941d2145   Johannes Weiner   mm: memcontrol: d...
5140
  	if (do_memsw_account() && PageSwapCache(page)) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
  		swp_entry_t entry = { .val = page_private(page) };
  		/*
  		 * The swap entry might not get freed for a long time,
  		 * let's not wait for it.  The page already received a
  		 * memory+swap charge, drop the swap entry duplicate.
  		 */
  		mem_cgroup_uncharge_swap(entry);
  	}
  }
  
  /**
   * mem_cgroup_cancel_charge - cancel a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5158
5159
  void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
  		bool compound)
00501b531   Johannes Weiner   mm: memcontrol: r...
5160
  {
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5161
  	unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
00501b531   Johannes Weiner   mm: memcontrol: r...
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
00501b531   Johannes Weiner   mm: memcontrol: r...
5172
5173
  	cancel_charge(memcg, nr_pages);
  }
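  
  /*
   * Editor's sketch, not part of the original file: the calling sequence
   * the try/commit/cancel API above is designed for.  Both
   * example_charge_new_page() and example_install_page() are hypothetical
   * names; real callers are the anonymous, shmem and swapin fault paths.
   */
  static int example_charge_new_page(struct page *page, struct mm_struct *mm,
  				   gfp_t gfp_mask)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	/* Reserve the charge first, reclaiming from @mm's memcg if needed. */
  	ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg, false);
  	if (ret)
  		return ret;
  
  	/* Instantiate the page (set up page->mapping, page tables, ...). */
  	ret = example_install_page(page);	/* hypothetical helper */
  	if (ret) {
  		/* Instantiation failed: give the reservation back. */
  		mem_cgroup_cancel_charge(page, memcg, false);
  		return ret;
  	}
  
  	/* Commit once page->mapping is set up and the page is reachable. */
  	mem_cgroup_commit_charge(page, memcg, false, false);
  	return 0;
  }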
747db954c   Johannes Weiner   mm: memcontrol: u...
5174
  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
747db954c   Johannes Weiner   mm: memcontrol: u...
5175
5176
5177
  			   unsigned long nr_anon, unsigned long nr_file,
  			   unsigned long nr_huge, struct page *dummy_page)
  {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5178
  	unsigned long nr_pages = nr_anon + nr_file;
747db954c   Johannes Weiner   mm: memcontrol: u...
5179
  	unsigned long flags;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5180
  	if (!mem_cgroup_is_root(memcg)) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5181
  		page_counter_uncharge(&memcg->memory, nr_pages);
7941d2145   Johannes Weiner   mm: memcontrol: d...
5182
  		if (do_memsw_account())
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5183
  			page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5184
5185
  		memcg_oom_recover(memcg);
  	}
747db954c   Johannes Weiner   mm: memcontrol: u...
5186
5187
5188
5189
5190
5191
  
  	local_irq_save(flags);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5192
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5193
5194
  	memcg_check_events(memcg, dummy_page);
  	local_irq_restore(flags);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
5195
5196
  
  	if (!mem_cgroup_is_root(memcg))
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5197
  		css_put_many(&memcg->css, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5198
5199
5200
5201
5202
  }
  
  static void uncharge_list(struct list_head *page_list)
  {
  	struct mem_cgroup *memcg = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5203
5204
5205
5206
  	unsigned long nr_anon = 0;
  	unsigned long nr_file = 0;
  	unsigned long nr_huge = 0;
  	unsigned long pgpgout = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5207
5208
  	struct list_head *next;
  	struct page *page;
8b5926560   Johannes Weiner   mm: memcontrol: c...
5209
5210
5211
5212
  	/*
  	 * Note that the list can be a single page->lru; hence the
  	 * do-while loop instead of a simple list_for_each_entry().
  	 */
747db954c   Johannes Weiner   mm: memcontrol: u...
5213
5214
5215
  	next = page_list->next;
  	do {
  		unsigned int nr_pages = 1;
747db954c   Johannes Weiner   mm: memcontrol: u...
5216
5217
5218
5219
5220
5221
  
  		page = list_entry(next, struct page, lru);
  		next = page->lru.next;
  
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		VM_BUG_ON_PAGE(page_count(page), page);
1306a85ae   Johannes Weiner   mm: embed the mem...
5222
  		if (!page->mem_cgroup)
747db954c   Johannes Weiner   mm: memcontrol: u...
5223
5224
5225
5226
  			continue;
  
  		/*
  		 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
5227
  		 * page->mem_cgroup at this point, we have fully
298333157   Johannes Weiner   mm: memcontrol: r...
5228
  		 * exclusive access to the page.
747db954c   Johannes Weiner   mm: memcontrol: u...
5229
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5230
  		if (memcg != page->mem_cgroup) {
747db954c   Johannes Weiner   mm: memcontrol: u...
5231
  			if (memcg) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5232
5233
5234
  				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  					       nr_huge, page);
  				pgpgout = nr_anon = nr_file = nr_huge = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5235
  			}
1306a85ae   Johannes Weiner   mm: embed the mem...
5236
  			memcg = page->mem_cgroup;
747db954c   Johannes Weiner   mm: memcontrol: u...
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
  		}
  
  		if (PageTransHuge(page)) {
  			nr_pages <<= compound_order(page);
  			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  			nr_huge += nr_pages;
  		}
  
  		if (PageAnon(page))
  			nr_anon += nr_pages;
  		else
  			nr_file += nr_pages;
1306a85ae   Johannes Weiner   mm: embed the mem...
5249
  		page->mem_cgroup = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5250
5251
5252
5253
5254
  
  		pgpgout++;
  	} while (next != page_list);
  
  	if (memcg)
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5255
5256
  		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  			       nr_huge, page);
747db954c   Johannes Weiner   mm: memcontrol: u...
5257
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5258
5259
5260
5261
5262
5263
5264
5265
5266
  /**
   * mem_cgroup_uncharge - uncharge a page
   * @page: page to uncharge
   *
   * Uncharge a page previously charged with mem_cgroup_try_charge() and
   * mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge(struct page *page)
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5267
5268
  	if (mem_cgroup_disabled())
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5269
  	/* Don't touch page->lru of any random page, pre-check: */
1306a85ae   Johannes Weiner   mm: embed the mem...
5270
  	if (!page->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5271
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5272
5273
5274
  	INIT_LIST_HEAD(&page->lru);
  	uncharge_list(&page->lru);
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5275

747db954c   Johannes Weiner   mm: memcontrol: u...
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
  /**
   * mem_cgroup_uncharge_list - uncharge a list of pages
   * @page_list: list of pages to uncharge
   *
   * Uncharge a list of pages previously charged with
   * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge_list(struct list_head *page_list)
  {
  	if (mem_cgroup_disabled())
  		return;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5287

747db954c   Johannes Weiner   mm: memcontrol: u...
5288
5289
  	if (!list_empty(page_list))
  		uncharge_list(page_list);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5290
5291
5292
  }
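  
  /*
   * Editor's sketch, not part of the original file: batched uncharging of
   * a private page list, as described above.  example_free_page_list() is
   * a hypothetical name; the in-tree batch user is release_pages().
   */
  static void example_free_page_list(struct list_head *pages)
  {
  	struct page *page, *next;
  
  	/* One pass over the list uncharges whole runs of same-memcg pages. */
  	mem_cgroup_uncharge_list(pages);
  
  	list_for_each_entry_safe(page, next, pages, lru) {
  		list_del(&page->lru);
  		free_hot_cold_page(page, false);
  	}
  }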
  
  /**
6a93ca8fd   Johannes Weiner   mm: migrate: do n...
5293
5294
5295
   * mem_cgroup_migrate - charge a page's replacement
   * @oldpage: currently circulating page
   * @newpage: replacement page
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5296
   *
6a93ca8fd   Johannes Weiner   mm: migrate: do n...
5297
5298
   * Charge @newpage as a replacement page for @oldpage. @oldpage will
   * be uncharged upon free.
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5299
5300
5301
   *
   * Both pages must be locked, @newpage->mapping must be set up.
   */
6a93ca8fd   Johannes Weiner   mm: migrate: do n...
5302
  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5303
  {
298333157   Johannes Weiner   mm: memcontrol: r...
5304
  	struct mem_cgroup *memcg;
44b7a8d33   Johannes Weiner   mm: memcontrol: d...
5305
5306
  	unsigned int nr_pages;
  	bool compound;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5307
5308
5309
  
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5310
  	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5311
5312
  	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
  		       newpage);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5313
5314
5315
5316
5317
  
  	if (mem_cgroup_disabled())
  		return;
  
  	/* Page cache replacement: new page already charged? */
1306a85ae   Johannes Weiner   mm: embed the mem...
5318
  	if (newpage->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5319
  		return;
45637bab3   Hugh Dickins   mm: rename mem_cg...
5320
  	/* Swapcache readahead pages can get replaced before being charged */
1306a85ae   Johannes Weiner   mm: embed the mem...
5321
  	memcg = oldpage->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
5322
  	if (!memcg)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5323
  		return;
44b7a8d33   Johannes Weiner   mm: memcontrol: d...
5324
5325
5326
5327
5328
5329
5330
5331
  	/* Force-charge the new page. The old one will be freed soon */
  	compound = PageTransHuge(newpage);
  	nr_pages = compound ? hpage_nr_pages(newpage) : 1;
  
  	page_counter_charge(&memcg->memory, nr_pages);
  	if (do_memsw_account())
  		page_counter_charge(&memcg->memsw, nr_pages);
  	css_get_many(&memcg->css, nr_pages);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5332

9cf7666ac   Johannes Weiner   mm: memcontrol: d...
5333
  	commit_charge(newpage, memcg, false);
44b7a8d33   Johannes Weiner   mm: memcontrol: d...
5334
5335
5336
5337
5338
  
  	local_irq_disable();
  	mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
  	memcg_check_events(memcg, newpage);
  	local_irq_enable();
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5339
  }
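  
  /*
   * Editor's sketch, not part of the original file: the ordering callers
   * of mem_cgroup_migrate() are expected to follow.  example_replace_page()
   * is a hypothetical name; real callers live in the page migration and
   * page cache replacement code.
   */
  static void example_replace_page(struct page *oldpage, struct page *newpage)
  {
  	/* The caller holds the lock on both pages. */
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
  
  	/*
  	 * newpage->mapping is already set up here; the new page gets
  	 * charged to the old page's memcg, and the old page keeps its
  	 * charge until it is freed.
  	 */
  	mem_cgroup_migrate(oldpage, newpage);
  }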
ef12947c9   Johannes Weiner   mm: memcontrol: s...
5340
  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
110920876   Johannes Weiner   mm: memcontrol: m...
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
  EXPORT_SYMBOL(memcg_sockets_enabled_key);
  
  void sock_update_memcg(struct sock *sk)
  {
  	struct mem_cgroup *memcg;
  
  	/* Socket cloning can throw us here with sk_memcg already
  	 * filled. It won't, however, necessarily happen from
  	 * process context. So the test for root memcg given
  	 * the current task's memcg won't help us in this case.
  	 *
  	 * Respecting the original socket's memcg is a better
  	 * decision in this case.
  	 */
  	if (sk->sk_memcg) {
  		BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
  		css_get(&sk->sk_memcg->css);
  		return;
  	}
  
  	rcu_read_lock();
  	memcg = mem_cgroup_from_task(current);
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5363
5364
  	if (memcg == root_mem_cgroup)
  		goto out;
0db152981   Johannes Weiner   mm: memcontrol: f...
5365
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5366
  		goto out;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5367
  	if (css_tryget_online(&memcg->css))
110920876   Johannes Weiner   mm: memcontrol: m...
5368
  		sk->sk_memcg = memcg;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5369
  out:
110920876   Johannes Weiner   mm: memcontrol: m...
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
  	rcu_read_unlock();
  }
  EXPORT_SYMBOL(sock_update_memcg);
  
  void sock_release_memcg(struct sock *sk)
  {
  	WARN_ON(!sk->sk_memcg);
  	css_put(&sk->sk_memcg->css);
  }
  
  /**
   * mem_cgroup_charge_skmem - charge socket memory
   * @memcg: memcg to charge
   * @nr_pages: number of pages to charge
   *
   * Charges @nr_pages to @memcg. Returns %true if the charge fit within
   * @memcg's configured limit, %false if the charge had to be forced.
   */
  bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5390
  	gfp_t gfp_mask = GFP_KERNEL;
110920876   Johannes Weiner   mm: memcontrol: m...
5391

f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5392
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
0db152981   Johannes Weiner   mm: memcontrol: f...
5393
  		struct page_counter *fail;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5394

0db152981   Johannes Weiner   mm: memcontrol: f...
5395
5396
  		if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
  			memcg->tcpmem_pressure = 0;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5397
5398
  			return true;
  		}
0db152981   Johannes Weiner   mm: memcontrol: f...
5399
5400
  		page_counter_charge(&memcg->tcpmem, nr_pages);
  		memcg->tcpmem_pressure = 1;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5401
  		return false;
110920876   Johannes Weiner   mm: memcontrol: m...
5402
  	}
d886f4e48   Johannes Weiner   mm: memcontrol: r...
5403

f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5404
5405
5406
  	/* Don't block in the packet receive path */
  	if (in_softirq())
  		gfp_mask = GFP_NOWAIT;
b2807f07f   Johannes Weiner   mm: memcontrol: a...
5407
  	this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5408
5409
5410
5411
  	if (try_charge(memcg, gfp_mask, nr_pages) == 0)
  		return true;
  
  	try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
110920876   Johannes Weiner   mm: memcontrol: m...
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
  	return false;
  }
  
  /**
   * mem_cgroup_uncharge_skmem - uncharge socket memory
   * @memcg: memcg to uncharge
   * @nr_pages: number of pages to uncharge
   */
  void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5422
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
0db152981   Johannes Weiner   mm: memcontrol: f...
5423
  		page_counter_uncharge(&memcg->tcpmem, nr_pages);
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5424
5425
  		return;
  	}
d886f4e48   Johannes Weiner   mm: memcontrol: r...
5426

b2807f07f   Johannes Weiner   mm: memcontrol: a...
5427
  	this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5428
5429
  	page_counter_uncharge(&memcg->memory, nr_pages);
  	css_put_many(&memcg->css, nr_pages);
110920876   Johannes Weiner   mm: memcontrol: m...
5430
  }
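  
  /*
   * Editor's sketch, not part of the original file: pairing the two socket
   * charge calls above around a buffer size change.  example_sk_grow() and
   * example_sk_shrink() are hypothetical names; the in-tree callers are in
   * net/core/sock.c.
   */
  static bool example_sk_grow(struct sock *sk, unsigned int nr_pages)
  {
  	if (!sk->sk_memcg)
  		return true;
  
  	/* false means the charge only went through by force. */
  	return mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages);
  }
  
  static void example_sk_shrink(struct sock *sk, unsigned int nr_pages)
  {
  	if (sk->sk_memcg)
  		mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
  }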
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5431
5432
5433
5434
5435
5436
5437
5438
5439
  static int __init cgroup_memory(char *s)
  {
  	char *token;
  
  	while ((token = strsep(&s, ",")) != NULL) {
  		if (!*token)
  			continue;
  		if (!strcmp(token, "nosocket"))
  			cgroup_memory_nosocket = true;
04823c833   Vladimir Davydov   mm: memcontrol: a...
5440
5441
  		if (!strcmp(token, "nokmem"))
  			cgroup_memory_nokmem = true;
f7e1cb6ec   Johannes Weiner   mm: memcontrol: a...
5442
5443
5444
5445
  	}
  	return 0;
  }
  __setup("cgroup.memory=", cgroup_memory);
110920876   Johannes Weiner   mm: memcontrol: m...
5446

2d11085e4   Michal Hocko   memcg: do not cre...
5447
  /*
1081312f9   Michal Hocko   memcg: cleanup me...
5448
5449
5450
5451
5452
5453
   * subsys_initcall() for memory controller.
   *
   * Some parts like hotcpu_notifier() have to be initialized from this context
   * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
   * everything that doesn't depend on a specific mem_cgroup structure should
   * be initialized from here.
2d11085e4   Michal Hocko   memcg: do not cre...
5454
5455
5456
   */
  static int __init mem_cgroup_init(void)
  {
95a045f63   Johannes Weiner   mm: memcontrol: c...
5457
  	int cpu, node;
2d11085e4   Michal Hocko   memcg: do not cre...
5458
  	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
95a045f63   Johannes Weiner   mm: memcontrol: c...
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
  
  	for_each_possible_cpu(cpu)
  		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
  			  drain_local_stock);
  
  	for_each_node(node) {
  		struct mem_cgroup_tree_per_node *rtpn;
  		int zone;
  
  		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
  				    node_online(node) ? node : NUMA_NO_NODE);
  
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			struct mem_cgroup_tree_per_zone *rtpz;
  
  			rtpz = &rtpn->rb_tree_per_zone[zone];
  			rtpz->rb_root = RB_ROOT;
  			spin_lock_init(&rtpz->lock);
  		}
  		soft_limit_tree.rb_tree_per_node[node] = rtpn;
  	}
2d11085e4   Michal Hocko   memcg: do not cre...
5480
5481
5482
  	return 0;
  }
  subsys_initcall(mem_cgroup_init);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
  
  #ifdef CONFIG_MEMCG_SWAP
  /**
   * mem_cgroup_swapout - transfer a memsw charge to swap
   * @page: page whose memsw charge to transfer
   * @entry: swap entry to move the charge to
   *
   * Transfer the memsw charge of @page to @entry.
   */
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short oldid;
  
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	VM_BUG_ON_PAGE(page_count(page), page);
7941d2145   Johannes Weiner   mm: memcontrol: d...
5499
  	if (!do_memsw_account())
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
  		return;
  
  	memcg = page->mem_cgroup;
  
  	/* Readahead page, never charged */
  	if (!memcg)
  		return;
  
  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
  	VM_BUG_ON_PAGE(oldid, page);
  	mem_cgroup_swap_statistics(memcg, true);
  
  	page->mem_cgroup = NULL;
  
  	if (!mem_cgroup_is_root(memcg))
  		page_counter_uncharge(&memcg->memory, 1);
ce9ce6659   Sebastian Andrzej Siewior   mm: memcontrol: b...
5516
5517
5518
5519
5520
5521
5522
  	/*
  	 * Interrupts should be disabled here because the caller holds the
  	 * mapping->tree_lock lock which is taken with interrupts-off. It is
  	 * important here to have the interrupts disabled because it is the
  	 * only synchronisation we have for updating the per-CPU variables.
  	 */
  	VM_BUG_ON(!irqs_disabled());
f627c2f53   Kirill A. Shutemov   memcg: adjust to ...
5523
  	mem_cgroup_charge_statistics(memcg, page, false, -1);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5524
5525
  	memcg_check_events(memcg, page);
  }
37e843511   Vladimir Davydov   mm: memcontrol: c...
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
  /*
   * mem_cgroup_try_charge_swap - try charging a swap entry
   * @page: page being added to swap
   * @entry: swap entry to charge
   *
   * Try to charge @entry to the memcg that @page belongs to.
   *
   * Returns 0 on success, -ENOMEM on failure.
   */
  int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	struct page_counter *counter;
  	unsigned short oldid;
  
  	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
  		return 0;
  
  	memcg = page->mem_cgroup;
  
  	/* Readahead page, never charged */
  	if (!memcg)
  		return 0;
  
  	if (!mem_cgroup_is_root(memcg) &&
  	    !page_counter_try_charge(&memcg->swap, 1, &counter))
  		return -ENOMEM;
  
  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
  	VM_BUG_ON_PAGE(oldid, page);
  	mem_cgroup_swap_statistics(memcg, true);
  
  	css_get(&memcg->css);
  	return 0;
  }
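  
  /*
   * Editor's sketch, not part of the original file: how the swap-out path
   * is expected to pair swap slot allocation with the charge above, giving
   * the slot back when the cgroup's swap limit is hit.  example_alloc_swap()
   * is a hypothetical name; the in-tree caller is the swap-out code.
   */
  static swp_entry_t example_alloc_swap(struct page *page)
  {
  	swp_entry_t entry = get_swap_page();
  
  	if (!entry.val)
  		return entry;
  
  	if (mem_cgroup_try_charge_swap(page, entry)) {
  		/* swap.max exceeded: release the slot again. */
  		swapcache_free(entry);
  		entry.val = 0;
  	}
  	return entry;
  }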
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5561
5562
5563
5564
  /**
   * mem_cgroup_uncharge_swap - uncharge a swap entry
   * @entry: swap entry to uncharge
   *
37e843511   Vladimir Davydov   mm: memcontrol: c...
5565
   * Drop the swap charge associated with @entry.
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5566
5567
5568
5569
5570
   */
  void mem_cgroup_uncharge_swap(swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short id;
37e843511   Vladimir Davydov   mm: memcontrol: c...
5571
  	if (!do_swap_account)
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5572
5573
5574
5575
  		return;
  
  	id = swap_cgroup_record(entry, 0);
  	rcu_read_lock();
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
5576
  	memcg = mem_cgroup_from_id(id);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5577
  	if (memcg) {
37e843511   Vladimir Davydov   mm: memcontrol: c...
5578
5579
5580
5581
5582
5583
  		if (!mem_cgroup_is_root(memcg)) {
  			if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
  				page_counter_uncharge(&memcg->swap, 1);
  			else
  				page_counter_uncharge(&memcg->memsw, 1);
  		}
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5584
5585
5586
5587
5588
  		mem_cgroup_swap_statistics(memcg, false);
  		css_put(&memcg->css);
  	}
  	rcu_read_unlock();
  }
d8b38438a   Vladimir Davydov   mm: vmscan: do no...
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
  long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
  {
  	long nr_swap_pages = get_nr_swap_pages();
  
  	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
  		return nr_swap_pages;
  	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
  		nr_swap_pages = min_t(long, nr_swap_pages,
  				      READ_ONCE(memcg->swap.limit) -
  				      page_counter_read(&memcg->swap));
  	return nr_swap_pages;
  }
5ccc5abaa   Vladimir Davydov   mm: free swap cac...
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
  bool mem_cgroup_swap_full(struct page *page)
  {
  	struct mem_cgroup *memcg;
  
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
  
  	if (vm_swap_full())
  		return true;
  	if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
  		return false;
  
  	memcg = page->mem_cgroup;
  	if (!memcg)
  		return false;
  
  	for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
  		if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
  			return true;
  
  	return false;
  }
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
  /* for remembering the boot option */
  #ifdef CONFIG_MEMCG_SWAP_ENABLED
  static int really_do_swap_account __initdata = 1;
  #else
  static int really_do_swap_account __initdata;
  #endif
  
  static int __init enable_swap_account(char *s)
  {
  	if (!strcmp(s, "1"))
  		really_do_swap_account = 1;
  	else if (!strcmp(s, "0"))
  		really_do_swap_account = 0;
  	return 1;
  }
  __setup("swapaccount=", enable_swap_account);
37e843511   Vladimir Davydov   mm: memcontrol: c...
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
  static u64 swap_current_read(struct cgroup_subsys_state *css,
  			     struct cftype *cft)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
  	return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
  }
  
  static int swap_max_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  	unsigned long max = READ_ONCE(memcg->swap.limit);
  
  	if (max == PAGE_COUNTER_MAX)
  		seq_puts(m, "max
  ");
  	else
  		seq_printf(m, "%llu
  ", (u64)max * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t swap_max_write(struct kernfs_open_file *of,
  			      char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long max;
  	int err;
  
  	buf = strstrip(buf);
  	err = page_counter_memparse(buf, "max", &max);
  	if (err)
  		return err;
  
  	mutex_lock(&memcg_limit_mutex);
  	err = page_counter_limit(&memcg->swap, max);
  	mutex_unlock(&memcg_limit_mutex);
  	if (err)
  		return err;
  
  	return nbytes;
  }
  
  static struct cftype swap_files[] = {
  	{
  		.name = "swap.current",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.read_u64 = swap_current_read,
  	},
  	{
  		.name = "swap.max",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = swap_max_show,
  		.write = swap_max_write,
  	},
  	{ }	/* terminate */
  };
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
  static struct cftype memsw_cgroup_files[] = {
  	{
  		.name = "memsw.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
  		.write = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.failcnt",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{ },	/* terminate */
  };
  
  static int __init mem_cgroup_swap_init(void)
  {
  	if (!mem_cgroup_disabled() && really_do_swap_account) {
  		do_swap_account = 1;
37e843511   Vladimir Davydov   mm: memcontrol: c...
5727
5728
  		WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
  					       swap_files));
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5729
5730
5731
5732
5733
5734
5735
5736
  		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
  						  memsw_cgroup_files));
  	}
  	return 0;
  }
  subsys_initcall(mem_cgroup_swap_init);
  
  #endif /* CONFIG_MEMCG_SWAP */