Blame view

mm/memcontrol.c 150 KB
8cdea7c05   Balbir Singh   Memory controller...
1
2
3
4
5
  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
78fb74669   Pavel Emelianov   Memory controller...
6
7
8
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
9
10
11
12
   * Memory thresholds
   * Copyright (C) 2009 Nokia Corporation
   * Author: Kirill A. Shutemov
   *
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
13
14
15
16
   * Kernel Memory Controller
   * Copyright (C) 2012 Parallels Inc. and Google Inc.
   * Authors: Glauber Costa and Suleiman Souhlal
   *
1575e68b3   Johannes Weiner   mm: memcontrol: u...
17
18
19
20
21
22
   * Native page reclaim
   * Charge lifetime sanitation
   * Lockless page tracking & accounting
   * Unified hierarchy configuration model
   * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
   *
8cdea7c05   Balbir Singh   Memory controller...
23
24
25
26
27
28
29
30
31
32
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
33
  #include <linux/page_counter.h>
8cdea7c05   Balbir Singh   Memory controller...
34
35
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
78fb74669   Pavel Emelianov   Memory controller...
36
  #include <linux/mm.h>
4ffef5fef   Daisuke Nishimura   memcg: move charg...
37
  #include <linux/hugetlb.h>
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
38
  #include <linux/pagemap.h>
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
39
  #include <linux/smp.h>
8a9f3ccd2   Balbir Singh   Memory controller...
40
  #include <linux/page-flags.h>
66e1707bc   Balbir Singh   Memory controller...
41
  #include <linux/backing-dev.h>
8a9f3ccd2   Balbir Singh   Memory controller...
42
43
  #include <linux/bit_spinlock.h>
  #include <linux/rcupdate.h>
e222432bf   Balbir Singh   memcg: show memcg...
44
  #include <linux/limits.h>
b9e15bafd   Paul Gortmaker   mm: Add export.h ...
45
  #include <linux/export.h>
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
46
  #include <linux/mutex.h>
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
47
  #include <linux/rbtree.h>
b6ac57d50   Balbir Singh   memcgroup: move m...
48
  #include <linux/slab.h>
66e1707bc   Balbir Singh   Memory controller...
49
  #include <linux/swap.h>
024914477   Daisuke Nishimura   memcg: move charg...
50
  #include <linux/swapops.h>
66e1707bc   Balbir Singh   Memory controller...
51
  #include <linux/spinlock.h>
2e72b6347   Kirill A. Shutemov   memcg: implement ...
52
  #include <linux/eventfd.h>
79bd9814e   Tejun Heo   cgroup, memcg: mo...
53
  #include <linux/poll.h>
2e72b6347   Kirill A. Shutemov   memcg: implement ...
54
  #include <linux/sort.h>
66e1707bc   Balbir Singh   Memory controller...
55
  #include <linux/fs.h>
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
56
  #include <linux/seq_file.h>
70ddf637e   Anton Vorontsov   memcg: add memory...
57
  #include <linux/vmpressure.h>
b69408e88   Christoph Lameter   vmscan: Use an in...
58
  #include <linux/mm_inline.h>
5d1ea48bd   Johannes Weiner   mm: page_cgroup: ...
59
  #include <linux/swap_cgroup.h>
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
60
  #include <linux/cpu.h>
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
61
  #include <linux/oom.h>
0056f4e66   Johannes Weiner   mm: memcg: lockde...
62
  #include <linux/lockdep.h>
79bd9814e   Tejun Heo   cgroup, memcg: mo...
63
  #include <linux/file.h>
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
64
  #include "internal.h"
d1a4c0b37   Glauber Costa   tcp memory pressu...
65
  #include <net/sock.h>
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
66
  #include <net/ip.h>
d1a4c0b37   Glauber Costa   tcp memory pressu...
67
  #include <net/tcp_memcontrol.h>
f35c3a8ee   Qiang Huang   memcg, kmem: use ...
68
  #include "slab.h"
8cdea7c05   Balbir Singh   Memory controller...
69

8697d3319   Balbir Singh   Memory controller...
70
  #include <asm/uaccess.h>
cc8e970c3   KOSAKI Motohiro   memcg: add mm_vms...
71
  #include <trace/events/vmscan.h>
073219e99   Tejun Heo   cgroup: clean up ...
72
73
  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  EXPORT_SYMBOL(memory_cgrp_subsys);
68ae564bb   David Rientjes   mm, memcg: avoid ...
74

a181b0e88   KAMEZAWA Hiroyuki   memcg: make globa...
75
  #define MEM_CGROUP_RECLAIM_RETRIES	5
6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
76
  static struct mem_cgroup *root_mem_cgroup __read_mostly;
8cdea7c05   Balbir Singh   Memory controller...
77

21afa38ee   Johannes Weiner   mm: memcontrol: c...
78
  /* Whether the swap controller is active */
c255a4580   Andrew Morton   memcg: rename con...
79
  #ifdef CONFIG_MEMCG_SWAP
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
80
  int do_swap_account __read_mostly;
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
81
  #else
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
82
  #define do_swap_account		0
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
83
  #endif
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
84
85
86
  static const char * const mem_cgroup_stat_names[] = {
  	"cache",
  	"rss",
b070e65c0   David Rientjes   mm, memcg: add rs...
87
  	"rss_huge",
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
88
  	"mapped_file",
3ea67d06e   Sha Zhengju   memcg: add per cg...
89
  	"writeback",
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
90
91
  	"swap",
  };
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
92
93
94
95
96
97
  static const char * const mem_cgroup_events_names[] = {
  	"pgpgin",
  	"pgpgout",
  	"pgfault",
  	"pgmajfault",
  };
58cf188ed   Sha Zhengju   memcg, oom: provi...
98
99
100
101
102
103
104
  static const char * const mem_cgroup_lru_names[] = {
  	"inactive_anon",
  	"active_anon",
  	"inactive_file",
  	"active_file",
  	"unevictable",
  };
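  /*
   * These name tables back the text interface: the entries above appear as
   * the keys printed by memory.stat. An illustrative excerpt (values made
   * up, not taken from any real system):
   *
   *	cache 1232896
   *	rss 524288
   *	rss_huge 0
   *	pgpgin 4096
   *	pgpgout 1024
   *	inactive_anon 0
   *	active_file 8192
   *	...
   */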
7a159cc9d   Johannes Weiner   memcg: use native...
105
106
107
108
109
110
111
112
  /*
   * Per memcg event counter is incremented at every pagein/pageout. With THP,
   * it will be incremented by the number of pages. This counter is used
   * to trigger some periodic events. This is straightforward and better
   * than using jiffies etc. to handle periodic memcg events.
   */
  enum mem_cgroup_events_target {
  	MEM_CGROUP_TARGET_THRESH,
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
113
  	MEM_CGROUP_TARGET_SOFTLIMIT,
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
114
  	MEM_CGROUP_TARGET_NUMAINFO,
7a159cc9d   Johannes Weiner   memcg: use native...
115
116
  	MEM_CGROUP_NTARGETS,
  };
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
117
118
119
  #define THRESHOLDS_EVENTS_TARGET 128
  #define SOFTLIMIT_EVENTS_TARGET 1024
  #define NUMAINFO_EVENTS_TARGET	1024
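  /*
   * Sketch of how the targets above are consumed (this mirrors
   * memcg_check_events() further down in this file; it is not additional
   * code): every charge/uncharge adds to the per-cpu nr_page_events counter,
   * and mem_cgroup_event_ratelimit() compares it against a per-target
   * snapshot, e.g.
   *
   *	if (mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))
   *		mem_cgroup_threshold(memcg);
   *
   * so the threshold check fires roughly once per THRESHOLDS_EVENTS_TARGET
   * page events.
   */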
e9f8974f2   Johannes Weiner   memcg: break out ...
120

d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
121
  struct mem_cgroup_stat_cpu {
7a159cc9d   Johannes Weiner   memcg: use native...
122
  	long count[MEM_CGROUP_STAT_NSTATS];
241994ed8   Johannes Weiner   mm: memcontrol: d...
123
  	unsigned long events[MEMCG_NR_EVENTS];
13114716c   Johannes Weiner   mm: memcg: keep r...
124
  	unsigned long nr_page_events;
7a159cc9d   Johannes Weiner   memcg: use native...
125
  	unsigned long targets[MEM_CGROUP_NTARGETS];
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
126
  };
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
127
128
  struct reclaim_iter {
  	struct mem_cgroup *position;
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
129
130
131
  	/* scan generation, increased every round-trip */
  	unsigned int generation;
  };
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
132
  /*
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
133
134
   * per-zone information in memory controller.
   */
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
135
  struct mem_cgroup_per_zone {
6290df545   Johannes Weiner   mm: collect LRU l...
136
  	struct lruvec		lruvec;
1eb492725   Hugh Dickins   memcg: lru_size i...
137
  	unsigned long		lru_size[NR_LRU_LISTS];
3e2f41f1f   KOSAKI Motohiro   memcg: add zone_r...
138

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
139
  	struct reclaim_iter	iter[DEF_PRIORITY + 1];
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
140

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
141
  	struct rb_node		tree_node;	/* RB tree node */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
142
  	unsigned long		usage_in_excess;/* Set to the value by which */
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
143
144
  						/* the soft limit is exceeded*/
  	bool			on_tree;
d79154bb5   Hugh Dickins   memcg: replace me...
145
  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
4e4169535   Balbir Singh   memory controller...
146
  						/* use container_of	   */
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
147
  };
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
148
149
150
151
  
  struct mem_cgroup_per_node {
  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
  };
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
  /*
   * Cgroups above their limits are maintained in a RB-Tree, independent of
   * their hierarchy representation
   */
  
  struct mem_cgroup_tree_per_zone {
  	struct rb_root rb_root;
  	spinlock_t lock;
  };
  
  struct mem_cgroup_tree_per_node {
  	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_tree {
  	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  };
  
  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
171
172
  struct mem_cgroup_threshold {
  	struct eventfd_ctx *eventfd;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
173
  	unsigned long threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
174
  };
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
175
  /* For threshold */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
176
  struct mem_cgroup_threshold_ary {
748dad36d   Sha Zhengju   memcg: make thres...
177
  	/* An array index points to threshold just below or equal to usage. */
5407a5625   Phil Carmody   mm: remove unnece...
178
  	int current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
179
180
181
182
183
  	/* Size of entries[] */
  	unsigned int size;
  	/* Array of thresholds */
  	struct mem_cgroup_threshold entries[0];
  };
2c488db27   Kirill A. Shutemov   memcg: clean up m...
184
185
186
187
188
189
190
191
192
193
194
  
  struct mem_cgroup_thresholds {
  	/* Primary thresholds array */
  	struct mem_cgroup_threshold_ary *primary;
  	/*
  	 * Spare threshold array.
  	 * This is needed to make mem_cgroup_unregister_event() "never fail".
  	 * It must be able to store at least primary->size - 1 entries.
  	 */
  	struct mem_cgroup_threshold_ary *spare;
  };
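  /*
   * Rough sketch of the primary/spare swap the comment above refers to (the
   * real logic lives in the usage unregister path; this is only an
   * illustration): removing one threshold copies the surviving entries into
   * the preallocated spare and publishes it, so unregistering never needs to
   * allocate and therefore never fails.
   *
   *	new = thresholds->spare;
   *	// copy every entry except the one being removed into new ...
   *	thresholds->spare = thresholds->primary;	// old array becomes spare
   *	rcu_assign_pointer(thresholds->primary, new);
   */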
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
195
196
197
198
199
  /* for OOM */
  struct mem_cgroup_eventfd_list {
  	struct list_head list;
  	struct eventfd_ctx *eventfd;
  };
2e72b6347   Kirill A. Shutemov   memcg: implement ...
200

79bd9814e   Tejun Heo   cgroup, memcg: mo...
201
202
203
  /*
   * cgroup_event represents events which userspace wants to receive.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
204
  struct mem_cgroup_event {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
205
  	/*
59b6f8734   Tejun Heo   memcg: make cgrou...
206
  	 * memcg which the event belongs to.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
207
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
208
  	struct mem_cgroup *memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
209
  	/*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
210
211
212
213
214
215
216
217
  	 * eventfd to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
  	 * Each of these is stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
218
219
220
221
  	 * register_event() callback will be used to add a new userspace
  	 * waiter for changes related to this event.  Use eventfd_signal()
  	 * on eventfd to send a notification to userspace.
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
222
  	int (*register_event)(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
223
  			      struct eventfd_ctx *eventfd, const char *args);
fba948078   Tejun Heo   cgroup, memcg: mo...
224
225
226
227
228
  	/*
  	 * unregister_event() callback will be called when userspace closes
  	 * the eventfd or when the cgroup is removed.  This callback must be
  	 * set if you want to provide notification functionality.
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
229
  	void (*unregister_event)(struct mem_cgroup *memcg,
fba948078   Tejun Heo   cgroup, memcg: mo...
230
231
  				 struct eventfd_ctx *eventfd);
  	/*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
232
233
234
235
236
237
238
239
  	 * All fields below are needed to unregister the event when
  	 * userspace closes the eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
c0ff4b854   Raghavendra K T   memcg: rename mem...
240
241
  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
242

f64c3f549   Balbir Singh   memory controller...
243
  /*
8cdea7c05   Balbir Singh   Memory controller...
244
245
246
247
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
8cdea7c05   Balbir Singh   Memory controller...
248
249
250
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
251
252
253
254
255
  
  	/* Accounted resources */
  	struct page_counter memory;
  	struct page_counter memsw;
  	struct page_counter kmem;
241994ed8   Johannes Weiner   mm: memcontrol: d...
256
257
258
  	/* Normal memory consumption range */
  	unsigned long low;
  	unsigned long high;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
259
  	unsigned long soft_limit;
59927fb98   Hugh Dickins   memcg: free mem_c...
260

70ddf637e   Anton Vorontsov   memcg: add memory...
261
262
  	/* vmpressure notifications */
  	struct vmpressure vmpressure;
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
263
264
  	/* css_online() has been completed */
  	int initialized;
465939a1f   Li Zefan   memcg: don't need...
265
  	/*
18f59ea7d   Balbir Singh   memcg: memory cgr...
266
267
268
  	 * Should the accounting and control be hierarchical, per subtree?
  	 */
  	bool use_hierarchy;
79dfdaccd   Michal Hocko   memcg: make oom_l...
269
270
271
  
  	bool		oom_lock;
  	atomic_t	under_oom;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
272
  	atomic_t	oom_wakeups;
79dfdaccd   Michal Hocko   memcg: make oom_l...
273

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
274
  	int	swappiness;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
275
276
  	/* OOM-Killer disable */
  	int		oom_kill_disable;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
277

2e72b6347   Kirill A. Shutemov   memcg: implement ...
278
279
280
281
  	/* protect arrays of thresholds */
  	struct mutex thresholds_lock;
  
  	/* thresholds for memory usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
282
  	struct mem_cgroup_thresholds thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
283

2e72b6347   Kirill A. Shutemov   memcg: implement ...
284
  	/* thresholds for mem+swap usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
285
  	struct mem_cgroup_thresholds memsw_thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
286

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
287
288
  	/* For oom notifier event fd */
  	struct list_head oom_notify;
185efc0f9   Johannes Weiner   memcg: Revert "me...
289

d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
290
  	/*
7dc74be03   Daisuke Nishimura   memcg: add interf...
291
292
293
  	 * Should we move charges of a task when a task is moved into this
  	 * mem_cgroup? And what type of charges should we move?
  	 */
f894ffa86   Andrew Morton   memcg: trivial cl...
294
  	unsigned long move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
295
  	/*
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
296
297
  	 * set > 0 if pages under this cgroup are moving to another cgroup.
  	 */
6de226191   Johannes Weiner   mm: memcontrol: t...
298
  	atomic_t		moving_account;
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
299
  	/* taken only while moving_account > 0 */
6de226191   Johannes Weiner   mm: memcontrol: t...
300
301
302
  	spinlock_t		move_lock;
  	struct task_struct	*move_lock_task;
  	unsigned long		move_lock_flags;
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
303
  	/*
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
304
  	 * percpu counter.
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
305
  	 */
3a7951b4c   Kirill A. Shutemov   memcg: mark stat ...
306
  	struct mem_cgroup_stat_cpu __percpu *stat;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
307
308
309
310
311
312
  	/*
  	 * used when a cpu is offlined or other synchronizations
  	 * See mem_cgroup_read_stat().
  	 */
  	struct mem_cgroup_stat_cpu nocpu_base;
  	spinlock_t pcp_counter_lock;
d1a4c0b37   Glauber Costa   tcp memory pressu...
313

4bd2c1ee4   Michal Hocko   memcg: cleanup km...
314
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
315
  	struct cg_proto tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
316
  #endif
2633d7a02   Glauber Costa   slab/slub: consid...
317
  #if defined(CONFIG_MEMCG_KMEM)
f7ce3190c   Vladimir Davydov   slab: embed memcg...
318
          /* Index in the kmem_cache->memcg_params.memcg_caches array */
2633d7a02   Glauber Costa   slab/slub: consid...
319
  	int kmemcg_id;
2788cf0c4   Vladimir Davydov   memcg: reparent l...
320
  	bool kmem_acct_activated;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
321
  	bool kmem_acct_active;
2633d7a02   Glauber Costa   slab/slub: consid...
322
  #endif
45cf7ebd5   Glauber Costa   memcg: reduce the...
323
324
325
326
327
328
329
  
  	int last_scanned_node;
  #if MAX_NUMNODES > 1
  	nodemask_t	scan_nodes;
  	atomic_t	numainfo_events;
  	atomic_t	numainfo_updating;
  #endif
70ddf637e   Anton Vorontsov   memcg: add memory...
330

fba948078   Tejun Heo   cgroup, memcg: mo...
331
332
333
  	/* List of events which userspace want to receive */
  	struct list_head event_list;
  	spinlock_t event_list_lock;
54f72fe02   Johannes Weiner   memcg: clean up m...
334
335
  	struct mem_cgroup_per_node *nodeinfo[0];
  	/* WARNING: nodeinfo must be the last member here */
8cdea7c05   Balbir Singh   Memory controller...
336
  };
510fc4e11   Glauber Costa   memcg: kmem accou...
337
  #ifdef CONFIG_MEMCG_KMEM
cb731d6c6   Vladimir Davydov   vmscan: per memor...
338
  bool memcg_kmem_is_active(struct mem_cgroup *memcg)
7de37682b   Glauber Costa   memcg: kmem accou...
339
  {
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
340
  	return memcg->kmem_acct_active;
7de37682b   Glauber Costa   memcg: kmem accou...
341
  }
510fc4e11   Glauber Costa   memcg: kmem accou...
342
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
343
344
  /* Stuff for moving charges at task migration. */
  /*
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
345
   * Types of charges to be moved.
7dc74be03   Daisuke Nishimura   memcg: add interf...
346
   */
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
347
348
349
  #define MOVE_ANON	0x1U
  #define MOVE_FILE	0x2U
  #define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
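  /*
   * For orientation (a hedged sketch; see Documentation/cgroups/memory.txt for
   * the authoritative description): the value written to the
   * memory.move_charge_at_immigrate control file is a bitmask built from the
   * flags above, e.g.
   *
   *	echo 1 > memory.move_charge_at_immigrate	# MOVE_ANON only
   *	echo 3 > memory.move_charge_at_immigrate	# MOVE_ANON | MOVE_FILE
   *
   * and the write handler rejects any value with bits outside MOVE_MASK.
   */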
7dc74be03   Daisuke Nishimura   memcg: add interf...
350

4ffef5fef   Daisuke Nishimura   memcg: move charg...
351
352
  /* "mc" and its members are protected by cgroup_mutex */
  static struct move_charge_struct {
b1dd693e5   Daisuke Nishimura   memcg: avoid dead...
353
  	spinlock_t	  lock; /* for from, to */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
354
355
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
356
  	unsigned long flags;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
357
  	unsigned long precharge;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
358
  	unsigned long moved_charge;
483c30b51   Daisuke Nishimura   memcg: improve pe...
359
  	unsigned long moved_swap;
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
360
361
362
  	struct task_struct *moving_task;	/* a task moving charges */
  	wait_queue_head_t waitq;		/* a waitq for other context */
  } mc = {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
363
  	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
364
365
  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
366

4e4169535   Balbir Singh   memory controller...
367
368
369
370
  /*
   * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
   * limit reclaim to prevent infinite loops, if they ever occur.
   */
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
371
  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
372
  #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
4e4169535   Balbir Singh   memory controller...
373

217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
374
375
  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
376
  	MEM_CGROUP_CHARGE_TYPE_ANON,
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
377
  	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
378
  	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
c05555b57   KAMEZAWA Hiroyuki   memcg: atomic ops...
379
380
  	NR_CHARGE_TYPE,
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
381
  /* for encoding cft->private value on file */
86ae53e1a   Glauber Costa   memcg: change def...
382
383
384
385
  enum res_type {
  	_MEM,
  	_MEMSWAP,
  	_OOM_TYPE,
510fc4e11   Glauber Costa   memcg: kmem accou...
386
  	_KMEM,
86ae53e1a   Glauber Costa   memcg: change def...
387
  };
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
388
389
  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
390
  #define MEMFILE_ATTR(val)	((val) & 0xffff)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
391
392
  /* Used for OOM notifier */
  #define OOM_CONTROL		(0)
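  /*
   * Worked example of the encoding above, as used by the oom_control file's
   * cftype later in this file:
   *
   *	.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
   *
   * which puts the type in bits 16..31 and the attribute in bits 0..15;
   * MEMFILE_TYPE(cft->private) and MEMFILE_ATTR(cft->private) recover the
   * two halves in the shared read/write handlers.
   */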
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
393

75822b449   Balbir Singh   memory controller...
394
  /*
0999821b1   Glauber Costa   memcg: replace cg...
395
396
397
398
399
   * The memcg_create_mutex will be held whenever a new cgroup is created.
   * As a consequence, any change that needs to protect against new child cgroups
   * appearing has to hold it as well.
   */
  static DEFINE_MUTEX(memcg_create_mutex);
b21451459   Wanpeng Li   memcg: add mem_cg...
400
401
  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
  {
a7c6d554a   Tejun Heo   cgroup: add/updat...
402
  	return s ? container_of(s, struct mem_cgroup, css) : NULL;
b21451459   Wanpeng Li   memcg: add mem_cg...
403
  }
70ddf637e   Anton Vorontsov   memcg: add memory...
404
405
406
407
408
409
410
411
412
413
414
415
  /* Some nice accessors for the vmpressure. */
  struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
  {
  	if (!memcg)
  		memcg = root_mem_cgroup;
  	return &memcg->vmpressure;
  }
  
  struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
  {
  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
  }
7ffc0edc4   Michal Hocko   memcg: move mem_c...
416
417
418
419
  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
  	return (memcg == root_mem_cgroup);
  }
4219b2da2   Li Zefan   memcg: fail to cr...
420
421
422
423
424
  /*
   * We restrict the id to the range [1, 65535], so it can fit into
   * an unsigned short.
   */
  #define MEM_CGROUP_ID_MAX	USHRT_MAX
34c00c319   Li Zefan   memcg: convert to...
425
426
  static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
  {
15a4c835e   Tejun Heo   cgroup, memcg: im...
427
  	return memcg->css.id;
34c00c319   Li Zefan   memcg: convert to...
428
  }
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
429
430
431
432
433
434
  /*
   * A helper function to get a mem_cgroup from an ID. Must be called under
   * rcu_read_lock().  The caller is responsible for calling
   * css_tryget_online() if the mem_cgroup is used for charging. (Dropping a
   * refcnt from swap can be done against a removed memcg.)
   */
34c00c319   Li Zefan   memcg: convert to...
435
436
437
  static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
  {
  	struct cgroup_subsys_state *css;
7d699ddb2   Tejun Heo   cgroup, memcg: al...
438
  	css = css_from_id(id, &memory_cgrp_subsys);
34c00c319   Li Zefan   memcg: convert to...
439
440
  	return mem_cgroup_from_css(css);
  }
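  /*
   * Hedged usage sketch for the lookup above, following the locking rules in
   * its comment; a caller that wants to charge against the result would do
   * roughly:
   *
   *	rcu_read_lock();
   *	memcg = mem_cgroup_from_id(id);
   *	if (memcg && !css_tryget_online(&memcg->css))
   *		memcg = NULL;
   *	rcu_read_unlock();
   *	// ... use memcg, then css_put(&memcg->css) when done ...
   */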
e1aab161e   Glauber Costa   socket: initial c...
441
  /* Writing them here to avoid exposing memcg's inner layout */
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
442
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
e1aab161e   Glauber Costa   socket: initial c...
443

e1aab161e   Glauber Costa   socket: initial c...
444
445
  void sock_update_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
446
  	if (mem_cgroup_sockets_enabled) {
e1aab161e   Glauber Costa   socket: initial c...
447
  		struct mem_cgroup *memcg;
3f1346193   Glauber Costa   memcg: decrement ...
448
  		struct cg_proto *cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
449
450
  
  		BUG_ON(!sk->sk_prot->proto_cgroup);
f3f511e1c   Glauber Costa   net: fix sock_clo...
451
452
453
454
455
456
457
458
459
460
  		/* Socket cloning can throw us here with sk_cgrp already
  		 * filled. It won't, however, necessarily happen from
  		 * process context. So the test for root memcg given
  		 * the current task's memcg won't help us in this case.
  		 *
  		 * Respecting the original socket's memcg is a better
  		 * decision in this case.
  		 */
  		if (sk->sk_cgrp) {
  			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
5347e5ae1   Li Zefan   memcg: use css_ge...
461
  			css_get(&sk->sk_cgrp->memcg->css);
f3f511e1c   Glauber Costa   net: fix sock_clo...
462
463
  			return;
  		}
e1aab161e   Glauber Costa   socket: initial c...
464
465
  		rcu_read_lock();
  		memcg = mem_cgroup_from_task(current);
3f1346193   Glauber Costa   memcg: decrement ...
466
  		cg_proto = sk->sk_prot->proto_cgroup(memcg);
5347e5ae1   Li Zefan   memcg: use css_ge...
467
  		if (!mem_cgroup_is_root(memcg) &&
ec903c0c8   Tejun Heo   cgroup: rename cs...
468
469
  		    memcg_proto_active(cg_proto) &&
  		    css_tryget_online(&memcg->css)) {
3f1346193   Glauber Costa   memcg: decrement ...
470
  			sk->sk_cgrp = cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
471
472
473
474
475
476
477
478
  		}
  		rcu_read_unlock();
  	}
  }
  EXPORT_SYMBOL(sock_update_memcg);
  
  void sock_release_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
479
  	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
e1aab161e   Glauber Costa   socket: initial c...
480
481
482
  		struct mem_cgroup *memcg;
  		WARN_ON(!sk->sk_cgrp->memcg);
  		memcg = sk->sk_cgrp->memcg;
5347e5ae1   Li Zefan   memcg: use css_ge...
483
  		css_put(&sk->sk_cgrp->memcg->css);
e1aab161e   Glauber Costa   socket: initial c...
484
485
  	}
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
486
487
488
489
490
  
  struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
  {
  	if (!memcg || mem_cgroup_is_root(memcg))
  		return NULL;
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
491
  	return &memcg->tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
492
493
  }
  EXPORT_SYMBOL(tcp_proto_cgroup);
e1aab161e   Glauber Costa   socket: initial c...
494

3f1346193   Glauber Costa   memcg: decrement ...
495
  #endif
a8964b9b8   Glauber Costa   memcg: use static...
496
  #ifdef CONFIG_MEMCG_KMEM
55007d849   Glauber Costa   memcg: allocate m...
497
  /*
f7ce3190c   Vladimir Davydov   slab: embed memcg...
498
   * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
b86278359   Li Zefan   memcg: stop using...
499
500
501
502
503
   * The main reason for not using the cgroup id for this:
   *  this works better in sparse environments, where we have a lot of memcgs,
   *  but only a few of them are kmem-limited. For instance, if we have 200
   *  memcgs and none but the 200th is kmem-limited, we'd have to have a
   *  200-entry array for that.
55007d849   Glauber Costa   memcg: allocate m...
504
   *
dbcf73e26   Vladimir Davydov   memcg: rename som...
505
506
   * The current size of the caches array is stored in memcg_nr_cache_ids. It
   * will double each time we have to increase it.
55007d849   Glauber Costa   memcg: allocate m...
507
   */
dbcf73e26   Vladimir Davydov   memcg: rename som...
508
509
  static DEFINE_IDA(memcg_cache_ida);
  int memcg_nr_cache_ids;
749c54151   Glauber Costa   memcg: aggregate ...
510

05257a1a3   Vladimir Davydov   memcg: add rwsem ...
511
512
513
514
515
516
517
518
519
520
521
522
  /* Protects memcg_nr_cache_ids */
  static DECLARE_RWSEM(memcg_cache_ids_sem);
  
  void memcg_get_cache_ids(void)
  {
  	down_read(&memcg_cache_ids_sem);
  }
  
  void memcg_put_cache_ids(void)
  {
  	up_read(&memcg_cache_ids_sem);
  }
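  /*
   * Expected usage of the pair above (a sketch, not copied from a particular
   * caller): hold the semaphore for reading while memcg_nr_cache_ids and the
   * per-cache memcg_caches arrays must stay stable.
   *
   *	memcg_get_cache_ids();
   *	for (i = 0; i < memcg_nr_cache_ids; i++)
   *		// ... inspect slot i of a cache's memcg_params.memcg_caches ...
   *	memcg_put_cache_ids();
   */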
55007d849   Glauber Costa   memcg: allocate m...
523
524
525
526
527
528
  /*
   * MIN_SIZE is different from 1, because we would like to avoid going through
   * the alloc/free process all the time. On a small machine, 4 kmem-limited
   * cgroups is a reasonable guess. In the future, it could be a parameter or
   * tunable, but that is not strictly necessary.
   *
b86278359   Li Zefan   memcg: stop using...
529
   * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
55007d849   Glauber Costa   memcg: allocate m...
530
531
   * this constant directly from cgroup, but it is understandable that this is
   * better kept as an internal representation in cgroup.c. In any case, the
b86278359   Li Zefan   memcg: stop using...
532
   * cgrp_id space is not getting any smaller, and we don't necessarily have to
55007d849   Glauber Costa   memcg: allocate m...
533
534
535
   * increase ours if it increases.
   */
  #define MEMCG_CACHES_MIN_SIZE 4
b86278359   Li Zefan   memcg: stop using...
536
  #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
55007d849   Glauber Costa   memcg: allocate m...
537

d7f25f8a2   Glauber Costa   memcg: infrastruc...
538
539
540
541
542
543
  /*
   * A lot of the calls to the cache allocation functions are expected to be
   * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
   * conditional on this static branch, we'll have to allow modules that do
   * kmem_cache_alloc and the like to see this symbol as well.
   */
a8964b9b8   Glauber Costa   memcg: use static...
544
  struct static_key memcg_kmem_enabled_key;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
545
  EXPORT_SYMBOL(memcg_kmem_enabled_key);
a8964b9b8   Glauber Costa   memcg: use static...
546

a8964b9b8   Glauber Costa   memcg: use static...
547
  #endif /* CONFIG_MEMCG_KMEM */
f64c3f549   Balbir Singh   memory controller...
548
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
549
  mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
f64c3f549   Balbir Singh   memory controller...
550
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
551
552
  	int nid = zone_to_nid(zone);
  	int zid = zone_idx(zone);
54f72fe02   Johannes Weiner   memcg: clean up m...
553
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
554
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
555
  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
d324236b3   Wu Fengguang   memcg: add access...
556
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
557
  	return &memcg->css;
d324236b3   Wu Fengguang   memcg: add access...
558
  }
f64c3f549   Balbir Singh   memory controller...
559
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
560
  mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
f64c3f549   Balbir Singh   memory controller...
561
  {
97a6c37b3   Johannes Weiner   memcg: change pag...
562
563
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
f64c3f549   Balbir Singh   memory controller...
564

e231875ba   Jianyu Zhan   mm: memcontrol: c...
565
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
566
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
567
568
569
570
571
572
573
574
575
576
577
578
579
580
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
581
582
  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
583
  					 unsigned long new_usage_in_excess)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
612
613
  static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
614
615
616
617
618
619
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
620
621
  static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  				       struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
622
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
623
624
625
  	unsigned long flags;
  
  	spin_lock_irqsave(&mctz->lock, flags);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
626
  	__mem_cgroup_remove_exceeded(mz, mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
627
  	spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
628
  }
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
629
630
631
  static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
  {
  	unsigned long nr_pages = page_counter_read(&memcg->memory);
4db0c3c29   Jason Low   mm: remove rest o...
632
  	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
633
634
635
636
637
638
639
  	unsigned long excess = 0;
  
  	if (nr_pages > soft_limit)
  		excess = nr_pages - soft_limit;
  
  	return excess;
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
640
641
642
  
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
643
  	unsigned long excess;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
644
645
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
646

e231875ba   Jianyu Zhan   mm: memcontrol: c...
647
  	mctz = soft_limit_tree_from_page(page);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
648
649
650
651
652
  	/*
  	 * Necessary to update all ancestors when hierarchy is used,
  	 * because their event counters are not touched.
  	 */
  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
653
  		mz = mem_cgroup_page_zoneinfo(memcg, page);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
654
  		excess = soft_limit_excess(memcg);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
655
656
657
658
659
  		/*
  		 * We have to update the tree if mz is on RB-tree or
  		 * mem is over its softlimit.
  		 */
  		if (excess || mz->on_tree) {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
660
661
662
  			unsigned long flags;
  
  			spin_lock_irqsave(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
663
664
  			/* if on-tree, remove it */
  			if (mz->on_tree)
cf2c81279   Johannes Weiner   mm: memcontrol: r...
665
  				__mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
666
667
668
669
  			/*
  			 * Insert again. mz->usage_in_excess will be updated.
  			 * If excess is 0, no tree ops.
  			 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
670
  			__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
671
  			spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
672
673
674
675
676
677
  		}
  	}
  }
  
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
  {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
678
  	struct mem_cgroup_tree_per_zone *mctz;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
679
680
  	struct mem_cgroup_per_zone *mz;
  	int nid, zid;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
681

e231875ba   Jianyu Zhan   mm: memcontrol: c...
682
683
684
685
  	for_each_node(nid) {
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			mctz = soft_limit_tree_node_zone(nid, zid);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
686
  			mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
  		}
  	}
  }
  
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back,
  	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
709
  	__mem_cgroup_remove_exceeded(mz, mctz);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
710
  	if (!soft_limit_excess(mz->memcg) ||
ec903c0c8   Tejun Heo   cgroup: rename cs...
711
  	    !css_tryget_online(&mz->memcg->css))
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
712
713
714
715
716
717
718
719
720
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
721
  	spin_lock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
722
  	mz = __mem_cgroup_largest_soft_limit_node(mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
723
  	spin_unlock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
724
725
  	return mz;
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
  /*
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both vmstat[] and percpu_counter use thresholds and do periodic
   * synchronization to implement a "quick" read. There is a trade-off between
   * reading cost and precision of the value, so we may have a chance to
   * implement a periodic synchronization of the counters in memcg as well.
   *
   * But this _read() function is used for the user interface now. Users account
   * memory usage by memory cgroup and _always_ require an exact value, because
   * they account memory. Even if we provided a quick-and-fuzzy read, we would
   * always have to visit all online cpus and compute the sum. So, for now,
   * unnecessary synchronization is not implemented. (It is only implemented
   * for cpu hotplug.)
   *
   * If there are kernel-internal actions which can make use of a not-exact
   * value, and reading all cpu values can be a performance bottleneck in some
   * common workload, a threshold and synchronization as in vmstat[] should be
   * implemented.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
745
  static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
7a159cc9d   Johannes Weiner   memcg: use native...
746
  				 enum mem_cgroup_stat_index idx)
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
747
  {
7a159cc9d   Johannes Weiner   memcg: use native...
748
  	long val = 0;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
749
  	int cpu;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
750

711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
751
752
  	get_online_cpus();
  	for_each_online_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
753
  		val += per_cpu(memcg->stat->count[idx], cpu);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
754
  #ifdef CONFIG_HOTPLUG_CPU
c0ff4b854   Raghavendra K T   memcg: rename mem...
755
756
757
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.count[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
758
759
  #endif
  	put_online_cpus();
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
760
761
  	return val;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
762
  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
e9f8974f2   Johannes Weiner   memcg: break out ...
763
764
765
766
  					    enum mem_cgroup_events_index idx)
  {
  	unsigned long val = 0;
  	int cpu;
9c5675127   David Rientjes   mm, memcg: protec...
767
  	get_online_cpus();
e9f8974f2   Johannes Weiner   memcg: break out ...
768
  	for_each_online_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
769
  		val += per_cpu(memcg->stat->events[idx], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
770
  #ifdef CONFIG_HOTPLUG_CPU
c0ff4b854   Raghavendra K T   memcg: rename mem...
771
772
773
  	spin_lock(&memcg->pcp_counter_lock);
  	val += memcg->nocpu_base.events[idx];
  	spin_unlock(&memcg->pcp_counter_lock);
e9f8974f2   Johannes Weiner   memcg: break out ...
774
  #endif
9c5675127   David Rientjes   mm, memcg: protec...
775
  	put_online_cpus();
e9f8974f2   Johannes Weiner   memcg: break out ...
776
777
  	return val;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
778
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
b070e65c0   David Rientjes   mm, memcg: add rs...
779
  					 struct page *page,
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
780
  					 int nr_pages)
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
781
  {
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
782
783
784
785
  	/*
  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
  	 * counted as CACHE even if it's on ANON LRU.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
786
  	if (PageAnon(page))
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
787
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
c0ff4b854   Raghavendra K T   memcg: rename mem...
788
  				nr_pages);
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
789
  	else
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
790
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
c0ff4b854   Raghavendra K T   memcg: rename mem...
791
  				nr_pages);
55e462b05   Balaji Rao   memcg: simple sta...
792

b070e65c0   David Rientjes   mm, memcg: add rs...
793
794
795
  	if (PageTransHuge(page))
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
  				nr_pages);
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
796
797
  	/* pagein of a big page is an event. So, ignore page size */
  	if (nr_pages > 0)
c0ff4b854   Raghavendra K T   memcg: rename mem...
798
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
799
  	else {
c0ff4b854   Raghavendra K T   memcg: rename mem...
800
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
801
802
  		nr_pages = -nr_pages; /* for event */
  	}
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
803

13114716c   Johannes Weiner   mm: memcg: keep r...
804
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
805
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
806
  unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
074291fea   Konstantin Khlebnikov   mm/vmscan: replac...
807
808
809
810
811
812
  {
  	struct mem_cgroup_per_zone *mz;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	return mz->lru_size[lru];
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
813
814
815
  static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
  						  int nid,
  						  unsigned int lru_mask)
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
816
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
817
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
818
  	int zid;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
819
  	VM_BUG_ON((unsigned)nid >= nr_node_ids);
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
820

e231875ba   Jianyu Zhan   mm: memcontrol: c...
821
822
823
824
825
826
827
828
829
830
831
832
  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		struct mem_cgroup_per_zone *mz;
  		enum lru_list lru;
  
  		for_each_lru(lru) {
  			if (!(BIT(lru) & lru_mask))
  				continue;
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			nr += mz->lru_size[lru];
  		}
  	}
  	return nr;
889976dbc   Ying Han   memcg: reclaim me...
833
  }
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
834

c0ff4b854   Raghavendra K T   memcg: rename mem...
835
  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
836
  			unsigned int lru_mask)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
837
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
838
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
839
  	int nid;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
840

31aaea4aa   Lai Jiangshan   memcontrol: use N...
841
  	for_each_node_state(nid, N_MEMORY)
e231875ba   Jianyu Zhan   mm: memcontrol: c...
842
843
  		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
  	return nr;
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
844
  }
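  /*
   * The lru_mask arguments above are bitmasks over enum lru_list (BIT() is
   * the generic kernel macro). A hedged example: counting all file-backed
   * pages of a memcg across nodes would be
   *
   *	nr = mem_cgroup_nr_lru_pages(memcg,
   *			BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE));
   */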
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
845
846
  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  				       enum mem_cgroup_events_target target)
7a159cc9d   Johannes Weiner   memcg: use native...
847
848
  {
  	unsigned long val, next;
13114716c   Johannes Weiner   mm: memcg: keep r...
849
  	val = __this_cpu_read(memcg->stat->nr_page_events);
4799401fe   Steven Rostedt   memcg: Fix race c...
850
  	next = __this_cpu_read(memcg->stat->targets[target]);
7a159cc9d   Johannes Weiner   memcg: use native...
851
  	/* from time_after() in jiffies.h */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
852
853
854
855
856
  	if ((long)next - (long)val < 0) {
  		switch (target) {
  		case MEM_CGROUP_TARGET_THRESH:
  			next = val + THRESHOLDS_EVENTS_TARGET;
  			break;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
857
858
859
  		case MEM_CGROUP_TARGET_SOFTLIMIT:
  			next = val + SOFTLIMIT_EVENTS_TARGET;
  			break;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
860
861
862
863
864
865
866
867
  		case MEM_CGROUP_TARGET_NUMAINFO:
  			next = val + NUMAINFO_EVENTS_TARGET;
  			break;
  		default:
  			break;
  		}
  		__this_cpu_write(memcg->stat->targets[target], next);
  		return true;
7a159cc9d   Johannes Weiner   memcg: use native...
868
  	}
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
869
  	return false;
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
870
871
872
873
874
875
  }
  
  /*
   * Check events in order.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
876
  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
877
878
  {
  	/* threshold event is triggered in finer grain than soft limit */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
879
880
  	if (unlikely(mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_THRESH))) {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
881
  		bool do_softlimit;
82b3f2a71   Andrew Morton   mm/memcontrol.c: ...
882
  		bool do_numainfo __maybe_unused;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
883

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
884
885
  		do_softlimit = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_SOFTLIMIT);
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
886
887
888
889
  #if MAX_NUMNODES > 1
  		do_numainfo = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_NUMAINFO);
  #endif
c0ff4b854   Raghavendra K T   memcg: rename mem...
890
  		mem_cgroup_threshold(memcg);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
891
892
  		if (unlikely(do_softlimit))
  			mem_cgroup_update_tree(memcg, page);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
893
  #if MAX_NUMNODES > 1
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
894
  		if (unlikely(do_numainfo))
c0ff4b854   Raghavendra K T   memcg: rename mem...
895
  			atomic_inc(&memcg->numainfo_events);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
896
  #endif
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
897
  	}
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
898
  }
cf475ad28   Balbir Singh   cgroups: add an o...
899
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
78fb74669   Pavel Emelianov   Memory controller...
900
  {
31a78f23b   Balbir Singh   mm owner: fix rac...
901
902
903
904
905
906
907
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
073219e99   Tejun Heo   cgroup: clean up ...
908
  	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
78fb74669   Pavel Emelianov   Memory controller...
909
  }
df3819754   Johannes Weiner   memcg: get_mem_cg...
910
  static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
911
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
912
  	struct mem_cgroup *memcg = NULL;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
913

54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
914
915
  	rcu_read_lock();
  	do {
6f6acb005   Michal Hocko   memcg: fix swapca...
916
917
918
919
920
921
  		/*
  		 * Page cache insertions can happen without an
  		 * actual mm context, e.g. during disk probing
  		 * on boot, loopback IO, acct() writes etc.
  		 */
  		if (unlikely(!mm))
df3819754   Johannes Weiner   memcg: get_mem_cg...
922
  			memcg = root_mem_cgroup;
6f6acb005   Michal Hocko   memcg: fix swapca...
923
924
925
926
927
  		else {
  			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  			if (unlikely(!memcg))
  				memcg = root_mem_cgroup;
  		}
ec903c0c8   Tejun Heo   cgroup: rename cs...
928
  	} while (!css_tryget_online(&memcg->css));
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
929
  	rcu_read_unlock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
930
  	return memcg;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
931
  }
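  /*
   * Hedged usage sketch for the helper above: it always returns a memcg with
   * its css reference elevated (falling back to root_mem_cgroup), so a
   * hypothetical caller owns that reference and must drop it:
   *
   *	memcg = get_mem_cgroup_from_mm(current->mm);
   *	// ... charge against or inspect memcg ...
   *	css_put(&memcg->css);
   */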
5660048cc   Johannes Weiner   mm: move memcg hi...
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
   * @prev: previously returned memcg, NULL on first invocation
   * @reclaim: cookie for shared reclaim walks, NULL for full walks
   *
   * Returns references to children of the hierarchy below @root, or
   * @root itself, or %NULL after a full round-trip.
   *
   * Caller must pass the return value in @prev on subsequent
   * invocations for reference counting, or use mem_cgroup_iter_break()
   * to cancel a hierarchy walk before the round-trip is complete.
   *
   * Reclaimers can specify a zone and a priority level in @reclaim to
   * divide up the memcgs in the hierarchy among all concurrent
   * reclaimers operating on the same zone and priority.
   */
694fbc0fe   Andrew Morton   revert "memcg: en...
949
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
5660048cc   Johannes Weiner   mm: move memcg hi...
950
  				   struct mem_cgroup *prev,
694fbc0fe   Andrew Morton   revert "memcg: en...
951
  				   struct mem_cgroup_reclaim_cookie *reclaim)
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
952
  {
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
953
954
  	struct reclaim_iter *uninitialized_var(iter);
  	struct cgroup_subsys_state *css = NULL;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
955
  	struct mem_cgroup *memcg = NULL;
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
956
  	struct mem_cgroup *pos = NULL;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
957

694fbc0fe   Andrew Morton   revert "memcg: en...
958
959
  	if (mem_cgroup_disabled())
  		return NULL;
5660048cc   Johannes Weiner   mm: move memcg hi...
960

9f3a0d093   Johannes Weiner   mm: memcg: consol...
961
962
  	if (!root)
  		root = root_mem_cgroup;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
963

9f3a0d093   Johannes Weiner   mm: memcg: consol...
964
  	if (prev && !reclaim)
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
965
  		pos = prev;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
966

9f3a0d093   Johannes Weiner   mm: memcg: consol...
967
968
  	if (!root->use_hierarchy && root != root_mem_cgroup) {
  		if (prev)
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
969
  			goto out;
694fbc0fe   Andrew Morton   revert "memcg: en...
970
  		return root;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
971
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
972

542f85f9a   Michal Hocko   memcg: rework mem...
973
  	rcu_read_lock();
5f5781619   Michal Hocko   memcg: relax memc...
974

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
975
976
977
978
979
980
981
982
983
984
  	if (reclaim) {
  		struct mem_cgroup_per_zone *mz;
  
  		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
  		iter = &mz->iter[reclaim->priority];
  
  		if (prev && reclaim->generation != iter->generation)
  			goto out_unlock;
  
  		do {
4db0c3c29   Jason Low   mm: remove rest o...
985
  			pos = READ_ONCE(iter->position);
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
  			/*
  			 * A racing update may change the position and
  			 * put the last reference, hence css_tryget(),
  			 * or retry to see the updated position.
  			 */
  		} while (pos && !css_tryget(&pos->css));
  	}
  
  	if (pos)
  		css = &pos->css;
  
  	for (;;) {
  		css = css_next_descendant_pre(css, &root->css);
  		if (!css) {
  			/*
  			 * Reclaimers share the hierarchy walk, and a
  			 * new one might jump in right at the end of
  			 * the hierarchy - make sure they see at least
  			 * one group and restart from the beginning.
  			 */
  			if (!prev)
  				continue;
  			break;
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1009
  		}
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1010

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1011
1012
1013
1014
1015
1016
  		/*
  		 * Verify the css and acquire a reference.  The root
  		 * is provided by the caller, so we know it's alive
  		 * and kicking, and don't take an extra reference.
  		 */
  		memcg = mem_cgroup_from_css(css);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1017

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1018
1019
  		if (css == &root->css)
  			break;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1020

b2052564e   Johannes Weiner   mm: memcontrol: c...
1021
  		if (css_tryget(css)) {
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1022
1023
1024
1025
1026
1027
1028
  			/*
  			 * Make sure the memcg is initialized:
  			 * mem_cgroup_css_online() orders the
  			 * initialization against setting the flag.
  			 */
  			if (smp_load_acquire(&memcg->initialized))
  				break;
542f85f9a   Michal Hocko   memcg: rework mem...
1029

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1030
  			css_put(css);
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1031
  		}
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1032

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1033
  		memcg = NULL;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1034
  	}
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
  
  	if (reclaim) {
  		if (cmpxchg(&iter->position, pos, memcg) == pos) {
  			if (memcg)
  				css_get(&memcg->css);
  			if (pos)
  				css_put(&pos->css);
  		}
  
  		/*
  		 * pairs with css_tryget when dereferencing iter->position
  		 * above.
  		 */
  		if (pos)
  			css_put(&pos->css);
  
  		if (!memcg)
  			iter->generation++;
  		else if (!prev)
  			reclaim->generation = iter->generation;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1055
  	}
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1056

542f85f9a   Michal Hocko   memcg: rework mem...
1057
1058
  out_unlock:
  	rcu_read_unlock();
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1059
  out:
c40046f3a   Michal Hocko   memcg: keep prev'...
1060
1061
  	if (prev && prev != root)
  		css_put(&prev->css);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1062
  	return memcg;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1063
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1064

5660048cc   Johannes Weiner   mm: move memcg hi...
1065
1066
1067
1068
1069
1070
1071
  /**
   * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   * @root: hierarchy root
   * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   */
  void mem_cgroup_iter_break(struct mem_cgroup *root,
  			   struct mem_cgroup *prev)
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1072
1073
1074
1075
1076
1077
  {
  	if (!root)
  		root = root_mem_cgroup;
  	if (prev && prev != root)
  		css_put(&prev->css);
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1078

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1079
1080
1081
1082
1083
1084
  /*
   * Iteration constructs for visiting all cgroups (under a tree).  If
   * loops are exited prematurely (break), mem_cgroup_iter_break() must
   * be used for reference counting.
   */
  #define for_each_mem_cgroup_tree(iter, root)		\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1085
  	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1086
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1087
  	     iter = mem_cgroup_iter(root, iter, NULL))
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1088

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1089
  #define for_each_mem_cgroup(iter)			\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1090
  	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1091
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1092
  	     iter = mem_cgroup_iter(NULL, iter, NULL))
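  /*
   * Illustrative sketch, not part of the original file: a walk that bails out
   * early must hand the last returned memcg to mem_cgroup_iter_break() so its
   * css reference is dropped.  should_stop() is a hypothetical predicate for
   * whatever condition ends the walk early.
   */
  #if 0
  static void example_partial_walk(struct mem_cgroup *root)
  {
  	struct mem_cgroup *iter;
  
  	for_each_mem_cgroup_tree(iter, root) {
  		if (should_stop(iter)) {
  			mem_cgroup_iter_break(root, iter);
  			break;
  		}
  	}
  }
  #endif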
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1093

68ae564bb   David Rientjes   mm, memcg: avoid ...
1094
  void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
456f998ec   Ying Han   memcg: add the pa...
1095
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1096
  	struct mem_cgroup *memcg;
456f998ec   Ying Han   memcg: add the pa...
1097

456f998ec   Ying Han   memcg: add the pa...
1098
  	rcu_read_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1099
1100
  	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  	if (unlikely(!memcg))
456f998ec   Ying Han   memcg: add the pa...
1101
1102
1103
  		goto out;
  
  	switch (idx) {
456f998ec   Ying Han   memcg: add the pa...
1104
  	case PGFAULT:
0e574a932   Johannes Weiner   mm: memcg: clean ...
1105
1106
1107
1108
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
  		break;
  	case PGMAJFAULT:
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
456f998ec   Ying Han   memcg: add the pa...
1109
1110
1111
1112
1113
1114
1115
  		break;
  	default:
  		BUG();
  	}
  out:
  	rcu_read_unlock();
  }
68ae564bb   David Rientjes   mm, memcg: avoid ...
1116
  EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
456f998ec   Ying Han   memcg: add the pa...
1117

925b7673c   Johannes Weiner   mm: make per-memc...
1118
1119
1120
  /**
   * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
   * @zone: zone of the wanted lruvec
fa9add641   Hugh Dickins   mm/memcg: apply a...
1121
   * @memcg: memcg of the wanted lruvec
925b7673c   Johannes Weiner   mm: make per-memc...
1122
1123
1124
1125
1126
1127
1128
1129
1130
   *
   * Returns the lru list vector holding pages for the given @zone and
   * @memcg.  This can be the global zone lruvec, if the memory controller
   * is disabled.
   */
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1131
  	struct lruvec *lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1132

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1133
1134
1135
1136
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1137

e231875ba   Jianyu Zhan   mm: memcontrol: c...
1138
  	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1149
  }
925b7673c   Johannes Weiner   mm: make per-memc...
1150
  /**
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1151
   * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
925b7673c   Johannes Weiner   mm: make per-memc...
1152
   * @page: the page
fa9add641   Hugh Dickins   mm/memcg: apply a...
1153
   * @zone: zone of the page
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1154
1155
1156
1157
   *
   * This function is only safe when following the LRU page isolation
   * and putback protocol: the LRU lock must be held, and the page must
   * either be PageLRU() or the caller must have isolated/allocated it.
925b7673c   Johannes Weiner   mm: make per-memc...
1158
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1159
  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1160
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1161
  	struct mem_cgroup_per_zone *mz;
925b7673c   Johannes Weiner   mm: make per-memc...
1162
  	struct mem_cgroup *memcg;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1163
  	struct lruvec *lruvec;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
1164

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1165
1166
1167
1168
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1169

1306a85ae   Johannes Weiner   mm: embed the mem...
1170
  	memcg = page->mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1171
  	/*
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1172
  	 * Swapcache readahead pages are added to the LRU - and
298333157   Johannes Weiner   mm: memcontrol: r...
1173
  	 * possibly migrated - before they are charged.
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1174
  	 */
298333157   Johannes Weiner   mm: memcontrol: r...
1175
1176
  	if (!memcg)
  		memcg = root_mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1177

e231875ba   Jianyu Zhan   mm: memcontrol: c...
1178
  	mz = mem_cgroup_page_zoneinfo(memcg, page);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1189
  }
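  /*
   * Illustrative sketch, not part of the original file: the isolation protocol
   * referred to above, assuming the zone-level lru_lock used by this kernel.
   * Holding the lock across the lookup keeps page->mem_cgroup stable for the
   * duration of the LRU manipulation.
   */
  #if 0
  static void example_isolate(struct page *page)
  {
  	struct zone *zone = page_zone(page);
  	struct lruvec *lruvec;
  
  	spin_lock_irq(&zone->lru_lock);
  	lruvec = mem_cgroup_page_lruvec(page, zone);
  	/* e.g. del_page_from_lru_list(page, lruvec, page_lru(page)); */
  	spin_unlock_irq(&zone->lru_lock);
  }
  #endif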
b69408e88   Christoph Lameter   vmscan: Use an in...
1190

925b7673c   Johannes Weiner   mm: make per-memc...
1191
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1192
1193
1194
1195
   * mem_cgroup_update_lru_size - account for adding or removing an lru page
   * @lruvec: mem_cgroup per zone lru vector
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
925b7673c   Johannes Weiner   mm: make per-memc...
1196
   *
fa9add641   Hugh Dickins   mm/memcg: apply a...
1197
1198
   * This function must be called when a page is added to or removed from an
   * lru list.
3f58a8294   Minchan Kim   memcg: move memcg...
1199
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1200
1201
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  				int nr_pages)
3f58a8294   Minchan Kim   memcg: move memcg...
1202
1203
  {
  	struct mem_cgroup_per_zone *mz;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1204
  	unsigned long *lru_size;
3f58a8294   Minchan Kim   memcg: move memcg...
1205
1206
1207
  
  	if (mem_cgroup_disabled())
  		return;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1208
1209
1210
1211
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	lru_size = mz->lru_size + lru;
  	*lru_size += nr_pages;
  	VM_BUG_ON((long)(*lru_size) < 0);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1212
  }
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
1213

2314b42db   Johannes Weiner   mm: memcontrol: d...
1214
  bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
3e92041d6   Michal Hocko   memcg: add mem_cg...
1215
  {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1216
  	if (root == memcg)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1217
  		return true;
2314b42db   Johannes Weiner   mm: memcontrol: d...
1218
  	if (!root->use_hierarchy)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1219
  		return false;
2314b42db   Johannes Weiner   mm: memcontrol: d...
1220
  	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1221
  }
2314b42db   Johannes Weiner   mm: memcontrol: d...
1222
  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1223
  {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1224
  	struct mem_cgroup *task_memcg;
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1225
  	struct task_struct *p;
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1226
  	bool ret;
4c4a22148   David Rientjes   memcontrol: move ...
1227

158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1228
  	p = find_lock_task_mm(task);
de077d222   David Rientjes   oom, memcg: fix e...
1229
  	if (p) {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1230
  		task_memcg = get_mem_cgroup_from_mm(p->mm);
de077d222   David Rientjes   oom, memcg: fix e...
1231
1232
1233
1234
1235
1236
1237
  		task_unlock(p);
  	} else {
  		/*
  		 * All threads may have already detached their mm's, but the oom
  		 * killer still needs to detect if they have already been oom
  		 * killed to prevent needlessly killing additional tasks.
  		 */
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1238
  		rcu_read_lock();
2314b42db   Johannes Weiner   mm: memcontrol: d...
1239
1240
  		task_memcg = mem_cgroup_from_task(task);
  		css_get(&task_memcg->css);
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1241
  		rcu_read_unlock();
de077d222   David Rientjes   oom, memcg: fix e...
1242
  	}
2314b42db   Johannes Weiner   mm: memcontrol: d...
1243
1244
  	ret = mem_cgroup_is_descendant(task_memcg, memcg);
  	css_put(&task_memcg->css);
4c4a22148   David Rientjes   memcontrol: move ...
1245
1246
  	return ret;
  }
c56d5c7df   Konstantin Khlebnikov   mm/vmscan: push l...
1247
  int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
14797e236   KOSAKI Motohiro   memcg: add inacti...
1248
  {
9b272977e   Johannes Weiner   memcg: skip scann...
1249
  	unsigned long inactive_ratio;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1250
  	unsigned long inactive;
9b272977e   Johannes Weiner   memcg: skip scann...
1251
  	unsigned long active;
c772be939   KOSAKI Motohiro   memcg: fix calcul...
1252
  	unsigned long gb;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1253

4d7dcca21   Hugh Dickins   mm/memcg: get_lru...
1254
1255
  	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
  	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
14797e236   KOSAKI Motohiro   memcg: add inacti...
1256

c772be939   KOSAKI Motohiro   memcg: fix calcul...
1257
1258
1259
1260
1261
  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
  	if (gb)
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
9b272977e   Johannes Weiner   memcg: skip scann...
1262
  	return inactive * inactive_ratio < active;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1263
  }
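  /*
   * Worked example (illustrative): with 4GB of anonymous pages on this lruvec,
   * gb = 4 and inactive_ratio = int_sqrt(40) = 6, so the inactive list is
   * considered low once inactive * 6 < active, i.e. when less than roughly a
   * seventh of the anon pages are inactive.  Below 1GB the ratio is 1.
   */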
90cbc2508   Vladimir Davydov   vmscan: force sca...
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
  bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
  {
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup *memcg;
  
  	if (mem_cgroup_disabled())
  		return true;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	memcg = mz->memcg;
  
  	return !!(memcg->css.flags & CSS_ONLINE);
  }
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1277
  #define mem_cgroup_from_counter(counter, member)	\
6d61ef409   Balbir Singh   memcg: memory cgr...
1278
  	container_of(counter, struct mem_cgroup, member)
19942822d   Johannes Weiner   memcg: prevent en...
1279
  /**
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1280
   * mem_cgroup_margin - calculate chargeable space of a memory cgroup
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1281
   * @memcg: the memory cgroup
19942822d   Johannes Weiner   memcg: prevent en...
1282
   *
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1283
   * Returns the maximum amount of memory @memcg can be charged with, in
7ec99d621   Johannes Weiner   memcg: unify char...
1284
   * pages.
19942822d   Johannes Weiner   memcg: prevent en...
1285
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1286
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
19942822d   Johannes Weiner   memcg: prevent en...
1287
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1288
1289
1290
  	unsigned long margin = 0;
  	unsigned long count;
  	unsigned long limit;
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1291

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1292
  	count = page_counter_read(&memcg->memory);
4db0c3c29   Jason Low   mm: remove rest o...
1293
  	limit = READ_ONCE(memcg->memory.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1294
1295
1296
1297
1298
  	if (count < limit)
  		margin = limit - count;
  
  	if (do_swap_account) {
  		count = page_counter_read(&memcg->memsw);
4db0c3c29   Jason Low   mm: remove rest o...
1299
  		limit = READ_ONCE(memcg->memsw.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1300
1301
1302
1303
1304
  		if (count <= limit)
  			margin = min(margin, limit - count);
  	}
  
  	return margin;
19942822d   Johannes Weiner   memcg: prevent en...
1305
  }
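  /*
   * Worked example (illustrative): with memory.limit = 1000 pages and a usage
   * of 900, the margin is 100 pages.  If swap accounting is enabled and
   * memsw.limit = 1200 with a mem+swap usage of 1150, the margin shrinks to
   * min(100, 50) = 50 pages.
   */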
1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
1306
  int mem_cgroup_swappiness(struct mem_cgroup *memcg)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1307
  {
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1308
  	/* root ? */
14208b0ec   Linus Torvalds   Merge branch 'for...
1309
  	if (mem_cgroup_disabled() || !memcg->css.parent)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1310
  		return vm_swappiness;
bf1ff2635   Johannes Weiner   memcg: remove mem...
1311
  	return memcg->swappiness;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1312
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1313
  /*
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1314
   * A routine for checking whether "mem" is under move_account() or not.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1315
   *
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1316
1317
1318
   * Checking whether a cgroup is mc.from, mc.to, or in the hierarchy of the
   * moving cgroups. This is for waiting at the high memory pressure
   * caused by "move".
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1319
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1320
  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1321
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1322
1323
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1324
  	bool ret = false;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1325
1326
1327
1328
1329
1330
1331
1332
1333
  	/*
  	 * Unlike the task_move routines, we access mc.to and mc.from without
  	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
  	 */
  	spin_lock(&mc.lock);
  	from = mc.from;
  	to = mc.to;
  	if (!from)
  		goto unlock;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1334

2314b42db   Johannes Weiner   mm: memcontrol: d...
1335
1336
  	ret = mem_cgroup_is_descendant(from, memcg) ||
  		mem_cgroup_is_descendant(to, memcg);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1337
1338
  unlock:
  	spin_unlock(&mc.lock);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1339
1340
  	return ret;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1341
  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1342
1343
  {
  	if (mc.moving_task && current != mc.moving_task) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1344
  		if (mem_cgroup_under_move(memcg)) {
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
  			DEFINE_WAIT(wait);
  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  			/* moving charge context might have finished. */
  			if (mc.moving_task)
  				schedule();
  			finish_wait(&mc.waitq, &wait);
  			return true;
  		}
  	}
  	return false;
  }
58cf188ed   Sha Zhengju   memcg, oom: provi...
1356
  #define K(x) ((x) << (PAGE_SHIFT-10))
e222432bf   Balbir Singh   memcg: show memcg...
1357
  /**
58cf188ed   Sha Zhengju   memcg, oom: provi...
1358
   * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
e222432bf   Balbir Singh   memcg: show memcg...
1359
1360
1361
1362
1363
1364
1365
1366
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
e61734c55   Tejun Heo   cgroup: remove cg...
1367
  	/* oom_info_lock ensures that parallel ooms do not interleave */
08088cb9a   Michal Hocko   memcg: change oom...
1368
  	static DEFINE_MUTEX(oom_info_lock);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1369
1370
  	struct mem_cgroup *iter;
  	unsigned int i;
e222432bf   Balbir Singh   memcg: show memcg...
1371

08088cb9a   Michal Hocko   memcg: change oom...
1372
  	mutex_lock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1373
  	rcu_read_lock();
2415b9f5c   Balasubramani Vivekanandan   memcg: print cgro...
1374
1375
1376
1377
1378
1379
1380
  	if (p) {
  		pr_info("Task in ");
  		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
  		pr_cont(" killed as a result of limit of ");
  	} else {
  		pr_info("Memory limit reached of cgroup ");
  	}
e61734c55   Tejun Heo   cgroup: remove cg...
1381
  	pr_cont_cgroup_path(memcg->css.cgroup);
0346dadbf   Greg Thelen   memcg: remove ext...
1382
1383
  	pr_cont("
  ");
e222432bf   Balbir Singh   memcg: show memcg...
1384

e222432bf   Balbir Singh   memcg: show memcg...
1385
  	rcu_read_unlock();
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
  	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memory)),
  		K((u64)memcg->memory.limit), memcg->memory.failcnt);
  	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memsw)),
  		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
  	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->kmem)),
  		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1398
1399
  
  	for_each_mem_cgroup_tree(iter, memcg) {
e61734c55   Tejun Heo   cgroup: remove cg...
1400
1401
  		pr_info("Memory cgroup stats for ");
  		pr_cont_cgroup_path(iter->css.cgroup);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
  		pr_cont(":");
  
  		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
  				continue;
  			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
  				K(mem_cgroup_read_stat(iter, i)));
  		}
  
  		for (i = 0; i < NR_LRU_LISTS; i++)
  			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
  				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
  
  		pr_cont("
  ");
  	}
08088cb9a   Michal Hocko   memcg: change oom...
1418
  	mutex_unlock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1419
  }
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1420
1421
1422
1423
  /*
   * This function returns the number of memcgs under the hierarchy tree.
   * Returns 1 (the self count) if there are no children.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1424
  static int mem_cgroup_count_children(struct mem_cgroup *memcg)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1425
1426
  {
  	int num = 0;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1427
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1428
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1429
  		num++;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1430
1431
  	return num;
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1432
  /*
a63d83f42   David Rientjes   oom: badness heur...
1433
1434
   * Return the memory (and swap, if configured) limit for a memcg.
   */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1435
  static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
a63d83f42   David Rientjes   oom: badness heur...
1436
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1437
  	unsigned long limit;
f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1438

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1439
  	limit = memcg->memory.limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1440
  	if (mem_cgroup_swappiness(memcg)) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1441
  		unsigned long memsw_limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1442

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1443
1444
  		memsw_limit = memcg->memsw.limit;
  		limit = min(limit + total_swap_pages, memsw_limit);
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1445
  	}
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1446
  	return limit;
a63d83f42   David Rientjes   oom: badness heur...
1447
  }
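  /*
   * Worked example (illustrative): with memory.limit = 1000 pages,
   * total_swap_pages = 500 and memsw.limit = 1200, a swappiness > 0 yields
   * min(1000 + 500, 1200) = 1200 pages; with swappiness == 0 the result is
   * just the 1000-page memory limit.
   */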
19965460e   David Rientjes   mm, memcg: make m...
1448
1449
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
  				     int order)
9cbb78bb3   David Rientjes   mm, memcg: introd...
1450
1451
1452
1453
1454
1455
  {
  	struct mem_cgroup *iter;
  	unsigned long chosen_points = 0;
  	unsigned long totalpages;
  	unsigned int points = 0;
  	struct task_struct *chosen = NULL;
876aafbfd   David Rientjes   mm, memcg: move a...
1456
  	/*
465adcf1e   David Rientjes   mm, memcg: give e...
1457
1458
1459
  	 * If current has a pending SIGKILL or is exiting, then automatically
  	 * select it.  The goal is to allow it to allocate so that it may
  	 * quickly exit and free its memory.
876aafbfd   David Rientjes   mm, memcg: move a...
1460
  	 */
d003f371b   Oleg Nesterov   oom: don't assume...
1461
  	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
49550b605   Michal Hocko   oom: add helpers ...
1462
  		mark_tsk_oom_victim(current);
876aafbfd   David Rientjes   mm, memcg: move a...
1463
1464
  		return;
  	}
2415b9f5c   Balasubramani Vivekanandan   memcg: print cgro...
1465
  	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1466
  	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1467
  	for_each_mem_cgroup_tree(iter, memcg) {
72ec70299   Tejun Heo   cgroup: make task...
1468
  		struct css_task_iter it;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1469
  		struct task_struct *task;
72ec70299   Tejun Heo   cgroup: make task...
1470
1471
  		css_task_iter_start(&iter->css, &it);
  		while ((task = css_task_iter_next(&it))) {
9cbb78bb3   David Rientjes   mm, memcg: introd...
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
  			switch (oom_scan_process_thread(task, totalpages, NULL,
  							false)) {
  			case OOM_SCAN_SELECT:
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = ULONG_MAX;
  				get_task_struct(chosen);
  				/* fall through */
  			case OOM_SCAN_CONTINUE:
  				continue;
  			case OOM_SCAN_ABORT:
72ec70299   Tejun Heo   cgroup: make task...
1484
  				css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1485
1486
1487
1488
1489
1490
1491
1492
  				mem_cgroup_iter_break(memcg, iter);
  				if (chosen)
  					put_task_struct(chosen);
  				return;
  			case OOM_SCAN_OK:
  				break;
  			};
  			points = oom_badness(task, memcg, NULL, totalpages);
d49ad9355   David Rientjes   mm, oom: prefer t...
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
  			if (!points || points < chosen_points)
  				continue;
  			/* Prefer thread group leaders for display purposes */
  			if (points == chosen_points &&
  			    thread_group_leader(chosen))
  				continue;
  
  			if (chosen)
  				put_task_struct(chosen);
  			chosen = task;
  			chosen_points = points;
  			get_task_struct(chosen);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1505
  		}
72ec70299   Tejun Heo   cgroup: make task...
1506
  		css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1507
1508
1509
1510
1511
  	}
  
  	if (!chosen)
  		return;
  	points = chosen_points * 1000 / totalpages;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1512
1513
  	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
  			 NULL, "Memory cgroup out of memory");
9cbb78bb3   David Rientjes   mm, memcg: introd...
1514
  }
ae6e71d3d   Michele Curti   mm/memcontrol.c: ...
1515
  #if MAX_NUMNODES > 1
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1516
1517
  /**
   * test_mem_cgroup_node_reclaimable
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1518
   * @memcg: the target memcg
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1519
1520
1521
1522
1523
1524
1525
   * @nid: the node ID to be checked.
   * @noswap : specify true here if the user wants file only information.
   *
   * This function returns whether the specified memcg contains any
   * reclaimable pages on a node. Returns true if there are any reclaimable
   * pages in the node.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1526
  static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1527
1528
  		int nid, bool noswap)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1529
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1530
1531
1532
  		return true;
  	if (noswap || !total_swap_pages)
  		return false;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1533
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1534
1535
1536
1537
  		return true;
  	return false;
  
  }
889976dbc   Ying Han   memcg: reclaim me...
1538
1539
1540
1541
1542
1543
1544
  
  /*
   * Always updating the nodemask is not very good - even if we have an empty
   * list or the wrong list here, we can start from some node and traverse all
   * nodes based on the zonelist. So update the list loosely once per 10 secs.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1545
  static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1546
1547
  {
  	int nid;
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1548
1549
1550
1551
  	/*
  	 * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
  	 * pagein/pageout changes since the last update.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1552
  	if (!atomic_read(&memcg->numainfo_events))
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1553
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1554
  	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
889976dbc   Ying Han   memcg: reclaim me...
1555
  		return;
889976dbc   Ying Han   memcg: reclaim me...
1556
  	/* make a nodemask where this memcg uses memory from */
31aaea4aa   Lai Jiangshan   memcontrol: use N...
1557
  	memcg->scan_nodes = node_states[N_MEMORY];
889976dbc   Ying Han   memcg: reclaim me...
1558

31aaea4aa   Lai Jiangshan   memcontrol: use N...
1559
  	for_each_node_mask(nid, node_states[N_MEMORY]) {
889976dbc   Ying Han   memcg: reclaim me...
1560

c0ff4b854   Raghavendra K T   memcg: rename mem...
1561
1562
  		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
  			node_clear(nid, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1563
  	}
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1564

c0ff4b854   Raghavendra K T   memcg: rename mem...
1565
1566
  	atomic_set(&memcg->numainfo_events, 0);
  	atomic_set(&memcg->numainfo_updating, 0);
889976dbc   Ying Han   memcg: reclaim me...
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
  }
  
  /*
   * Selecting a node where we start reclaim from. Because all we need is to
   * reduce the usage counter, starting from anywhere is OK. Considering
   * memory reclaim from the current node, there are pros and cons:
   *
   * Freeing memory from the current node means freeing memory from a node
   * which we'll use or have used, so it may hurt LRU ordering. And if several
   * threads hit their limits, they will contend on one node. But freeing from
   * a remote node means more costs for memory reclaim because of memory
   * latency.
   *
   * For now, we use round-robin. A better algorithm is welcome.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1581
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1582
1583
  {
  	int node;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1584
1585
  	mem_cgroup_may_update_nodemask(memcg);
  	node = memcg->last_scanned_node;
889976dbc   Ying Han   memcg: reclaim me...
1586

c0ff4b854   Raghavendra K T   memcg: rename mem...
1587
  	node = next_node(node, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1588
  	if (node == MAX_NUMNODES)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1589
  		node = first_node(memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1590
1591
1592
1593
1594
1595
1596
1597
  	/*
  	 * We call this when we hit the limit, not when pages are added to the
  	 * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or the
  	 * memcg is too small and all pages are not on the LRU. In that case,
  	 * we use the current node.
  	 */
  	if (unlikely(node == MAX_NUMNODES))
  		node = numa_node_id();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1598
  	memcg->last_scanned_node = node;
889976dbc   Ying Han   memcg: reclaim me...
1599
1600
  	return node;
  }
889976dbc   Ying Han   memcg: reclaim me...
1601
  #else
c0ff4b854   Raghavendra K T   memcg: rename mem...
1602
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1603
1604
1605
1606
  {
  	return 0;
  }
  #endif
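  /*
   * Worked example (illustrative): with scan_nodes = {0, 2} and
   * last_scanned_node = 0, next_node() picks node 2; the following call wraps
   * past MAX_NUMNODES back to node 0.  If scan_nodes is empty, the current
   * node (numa_node_id()) is used instead.
   */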
0608f43da   Andrew Morton   revert "memcg, vm...
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
  static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
  				   struct zone *zone,
  				   gfp_t gfp_mask,
  				   unsigned long *total_scanned)
  {
  	struct mem_cgroup *victim = NULL;
  	int total = 0;
  	int loop = 0;
  	unsigned long excess;
  	unsigned long nr_scanned;
  	struct mem_cgroup_reclaim_cookie reclaim = {
  		.zone = zone,
  		.priority = 0,
  	};
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1621
  	excess = soft_limit_excess(root_memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
  
  	while (1) {
  		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
  		if (!victim) {
  			loop++;
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
  				 * anything, it might be because there are
  				 * no reclaimable pages under this hierarchy
  				 */
  				if (!total)
  					break;
  				/*
  				 * We want to do more targeted reclaim.
  				 * excess >> 2 is not too excessive, so we don't
  				 * reclaim too much, nor so little that we keep
  				 * coming back to reclaim from this cgroup
  				 */
  				if (total >= (excess >> 2) ||
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
  					break;
  			}
  			continue;
  		}
0608f43da   Andrew Morton   revert "memcg, vm...
1647
1648
1649
  		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
  						     zone, &nr_scanned);
  		*total_scanned += nr_scanned;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1650
  		if (!soft_limit_excess(root_memcg))
0608f43da   Andrew Morton   revert "memcg, vm...
1651
  			break;
6d61ef409   Balbir Singh   memcg: memory cgr...
1652
  	}
0608f43da   Andrew Morton   revert "memcg, vm...
1653
1654
  	mem_cgroup_iter_break(root_memcg, victim);
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1655
  }
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1656
1657
1658
1659
1660
  #ifdef CONFIG_LOCKDEP
  static struct lockdep_map memcg_oom_lock_dep_map = {
  	.name = "memcg_oom_lock",
  };
  #endif
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1661
  static DEFINE_SPINLOCK(memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1662
1663
1664
1665
  /*
   * Check OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
   */
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1666
  static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1667
  {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1668
  	struct mem_cgroup *iter, *failed = NULL;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1669

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1670
  	spin_lock(&memcg_oom_lock);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1671
  	for_each_mem_cgroup_tree(iter, memcg) {
23751be00   Johannes Weiner   memcg: fix hierar...
1672
  		if (iter->oom_lock) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1673
1674
1675
1676
  			/*
  			 * this subtree of our hierarchy is already locked,
  			 * so we cannot take the lock.
  			 */
79dfdaccd   Michal Hocko   memcg: make oom_l...
1677
  			failed = iter;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1678
1679
  			mem_cgroup_iter_break(memcg, iter);
  			break;
23751be00   Johannes Weiner   memcg: fix hierar...
1680
1681
  		} else
  			iter->oom_lock = true;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1682
  	}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1683

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
  	if (failed) {
  		/*
  		 * OK, we failed to lock the whole subtree, so we have
  		 * to clean up what we already set up, up to the failing subtree
  		 */
  		for_each_mem_cgroup_tree(iter, memcg) {
  			if (iter == failed) {
  				mem_cgroup_iter_break(memcg, iter);
  				break;
  			}
  			iter->oom_lock = false;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1695
  		}
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1696
1697
  	} else
  		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1698
1699
1700
1701
  
  	spin_unlock(&memcg_oom_lock);
  
  	return !failed;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1702
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1703

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1704
  static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1705
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1706
  	struct mem_cgroup *iter;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1707
  	spin_lock(&memcg_oom_lock);
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1708
  	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1709
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1710
  		iter->oom_lock = false;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1711
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1712
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1713
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1714
1715
  {
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1716
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1717
1718
  		atomic_inc(&iter->under_oom);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1719
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1720
1721
  {
  	struct mem_cgroup *iter;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1722
1723
1724
1725
1726
  	/*
  	 * When a new child is created while the hierarchy is under oom,
  	 * mem_cgroup_oom_lock() may not be called. We have to use
  	 * atomic_add_unless() here.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1727
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1728
  		atomic_add_unless(&iter->under_oom, -1, 0);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1729
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1730
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1731
  struct oom_wait_info {
d79154bb5   Hugh Dickins   memcg: replace me...
1732
  	struct mem_cgroup *memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1733
1734
1735
1736
1737
1738
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
1739
1740
  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
  	struct mem_cgroup *oom_wait_memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1741
1742
1743
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
d79154bb5   Hugh Dickins   memcg: replace me...
1744
  	oom_wait_memcg = oom_wait_info->memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1745

2314b42db   Johannes Weiner   mm: memcontrol: d...
1746
1747
  	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
  	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1748
  		return 0;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1749
1750
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1751
  static void memcg_wakeup_oom(struct mem_cgroup *memcg)
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1752
  {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1753
  	atomic_inc(&memcg->oom_wakeups);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1754
1755
  	/* for filtering, pass "memcg" as argument. */
  	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1756
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1757
  static void memcg_oom_recover(struct mem_cgroup *memcg)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1758
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1759
1760
  	if (memcg && atomic_read(&memcg->under_oom))
  		memcg_wakeup_oom(memcg);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1761
  }
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1762
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1763
  {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1764
1765
  	if (!current->memcg_oom.may_oom)
  		return;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1766
  	/*
494264208   Johannes Weiner   mm: memcg: handle...
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
  	 * We are in the middle of the charge context here, so we
  	 * don't want to block when potentially sitting on a callstack
  	 * that holds all kinds of filesystem and mm locks.
  	 *
  	 * Also, the caller may handle a failed allocation gracefully
  	 * (like optional page cache readahead) and so an OOM killer
  	 * invocation might not even be necessary.
  	 *
  	 * That's why we don't do anything here except remember the
  	 * OOM context and then deal with it at the end of the page
  	 * fault when the stack is unwound, the locks are released,
  	 * and when we know whether the fault was overall successful.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1779
  	 */
494264208   Johannes Weiner   mm: memcg: handle...
1780
1781
1782
1783
  	css_get(&memcg->css);
  	current->memcg_oom.memcg = memcg;
  	current->memcg_oom.gfp_mask = mask;
  	current->memcg_oom.order = order;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1784
1785
1786
1787
  }
  
  /**
   * mem_cgroup_oom_synchronize - complete memcg OOM handling
494264208   Johannes Weiner   mm: memcg: handle...
1788
   * @handle: actually kill/wait or just clean up the OOM state
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1789
   *
494264208   Johannes Weiner   mm: memcg: handle...
1790
1791
   * This has to be called at the end of a page fault if the memcg OOM
   * handler was enabled.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1792
   *
494264208   Johannes Weiner   mm: memcg: handle...
1793
   * Memcg supports userspace OOM handling where failed allocations must
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1794
1795
1796
1797
   * sleep on a waitqueue until the userspace task resolves the
   * situation.  Sleeping directly in the charge context with all kinds
   * of locks held is not a good idea, instead we remember an OOM state
   * in the task and mem_cgroup_oom_synchronize() has to be called at
494264208   Johannes Weiner   mm: memcg: handle...
1798
   * the end of the page fault to complete the OOM handling.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1799
1800
   *
   * Returns %true if an ongoing memcg OOM situation was detected and
494264208   Johannes Weiner   mm: memcg: handle...
1801
   * completed, %false otherwise.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1802
   */
494264208   Johannes Weiner   mm: memcg: handle...
1803
  bool mem_cgroup_oom_synchronize(bool handle)
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1804
  {
494264208   Johannes Weiner   mm: memcg: handle...
1805
  	struct mem_cgroup *memcg = current->memcg_oom.memcg;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1806
  	struct oom_wait_info owait;
494264208   Johannes Weiner   mm: memcg: handle...
1807
  	bool locked;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1808
1809
  
  	/* OOM is global, do not handle */
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1810
  	if (!memcg)
494264208   Johannes Weiner   mm: memcg: handle...
1811
  		return false;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1812

c32b3cbe0   Michal Hocko   oom, PM: make OOM...
1813
  	if (!handle || oom_killer_disabled)
494264208   Johannes Weiner   mm: memcg: handle...
1814
  		goto cleanup;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1815
1816
1817
1818
1819
1820
  
  	owait.memcg = memcg;
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1821

3812c8c8f   Johannes Weiner   mm: memcg: do not...
1822
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
494264208   Johannes Weiner   mm: memcg: handle...
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
  	mem_cgroup_mark_under_oom(memcg);
  
  	locked = mem_cgroup_oom_trylock(memcg);
  
  	if (locked)
  		mem_cgroup_oom_notify(memcg);
  
  	if (locked && !memcg->oom_kill_disable) {
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
  					 current->memcg_oom.order);
  	} else {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1836
  		schedule();
494264208   Johannes Weiner   mm: memcg: handle...
1837
1838
1839
1840
1841
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  	}
  
  	if (locked) {
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1842
1843
1844
1845
1846
1847
1848
1849
  		mem_cgroup_oom_unlock(memcg);
  		/*
  		 * There is no guarantee that an OOM-lock contender
  		 * sees the wakeups triggered by the OOM kill
  		 * uncharges.  Wake any sleepers explicitly.
  		 */
  		memcg_oom_recover(memcg);
  	}
494264208   Johannes Weiner   mm: memcg: handle...
1850
1851
  cleanup:
  	current->memcg_oom.memcg = NULL;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1852
  	css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1853
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1854
  }
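  /*
   * Illustrative sketch, not part of the original file: mem_cgroup_oom() only
   * records the OOM context during the charge attempt; the end of the page
   * fault completes it once the stack is unwound and locks are dropped.  The
   * hypothetical hook below shows the shape of that call.
   */
  #if 0
  static void example_end_of_fault(void)
  {
  	/* kill or wait according to the recorded memcg OOM state */
  	if (mem_cgroup_oom_synchronize(true))
  		return;
  	/* no memcg OOM was in progress: global OOM handling would follow */
  }
  #endif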
d7365e783   Johannes Weiner   mm: memcontrol: f...
1855
1856
1857
  /**
   * mem_cgroup_begin_page_stat - begin a page state statistics transaction
   * @page: page that is going to change accounted state
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1858
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
1859
1860
1861
   * This function must mark the beginning of an accounted page state
   * change to prevent double accounting when the page is concurrently
   * being moved to another memcg:
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1862
   *
6de226191   Johannes Weiner   mm: memcontrol: t...
1863
   *   memcg = mem_cgroup_begin_page_stat(page);
d7365e783   Johannes Weiner   mm: memcontrol: f...
1864
1865
   *   if (TestClearPageState(page))
   *     mem_cgroup_update_page_stat(memcg, state, -1);
6de226191   Johannes Weiner   mm: memcontrol: t...
1866
   *   mem_cgroup_end_page_stat(memcg);
d69b042f3   Balbir Singh   memcg: add file-b...
1867
   */
6de226191   Johannes Weiner   mm: memcontrol: t...
1868
  struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1869
1870
  {
  	struct mem_cgroup *memcg;
6de226191   Johannes Weiner   mm: memcontrol: t...
1871
  	unsigned long flags;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1872

6de226191   Johannes Weiner   mm: memcontrol: t...
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
  	/*
  	 * The RCU lock is held throughout the transaction.  The fast
  	 * path can get away without acquiring the memcg->move_lock
  	 * because page moving starts with an RCU grace period.
  	 *
  	 * The RCU lock also protects the memcg from being freed when
  	 * the page state that is going to change is the only thing
  	 * preventing the page from being uncharged.
  	 * E.g. end-writeback clearing PageWriteback(), which allows
  	 * migration to go ahead and uncharge the page before the
  	 * account transaction might be complete.
  	 */
d7365e783   Johannes Weiner   mm: memcontrol: f...
1885
1886
1887
1888
  	rcu_read_lock();
  
  	if (mem_cgroup_disabled())
  		return NULL;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1889
  again:
1306a85ae   Johannes Weiner   mm: embed the mem...
1890
  	memcg = page->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
1891
  	if (unlikely(!memcg))
d7365e783   Johannes Weiner   mm: memcontrol: f...
1892
  		return NULL;
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1893
  	if (atomic_read(&memcg->moving_account) <= 0)
d7365e783   Johannes Weiner   mm: memcontrol: f...
1894
  		return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1895

6de226191   Johannes Weiner   mm: memcontrol: t...
1896
  	spin_lock_irqsave(&memcg->move_lock, flags);
1306a85ae   Johannes Weiner   mm: embed the mem...
1897
  	if (memcg != page->mem_cgroup) {
6de226191   Johannes Weiner   mm: memcontrol: t...
1898
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1899
1900
  		goto again;
  	}
6de226191   Johannes Weiner   mm: memcontrol: t...
1901
1902
1903
1904
1905
1906
1907
1908
  
  	/*
  	 * When charge migration first begins, we can have locked and
  	 * unlocked page stat updates happening concurrently.  Track
  	 * the task that holds the lock for mem_cgroup_end_page_stat().
  	 */
  	memcg->move_lock_task = current;
  	memcg->move_lock_flags = flags;
d7365e783   Johannes Weiner   mm: memcontrol: f...
1909
1910
  
  	return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1911
  }
d7365e783   Johannes Weiner   mm: memcontrol: f...
1912
1913
1914
  /**
   * mem_cgroup_end_page_stat - finish a page state statistics transaction
   * @memcg: the memcg that was accounted against
d7365e783   Johannes Weiner   mm: memcontrol: f...
1915
   */
6de226191   Johannes Weiner   mm: memcontrol: t...
1916
  void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1917
  {
6de226191   Johannes Weiner   mm: memcontrol: t...
1918
1919
1920
1921
1922
1923
1924
1925
  	if (memcg && memcg->move_lock_task == current) {
  		unsigned long flags = memcg->move_lock_flags;
  
  		memcg->move_lock_task = NULL;
  		memcg->move_lock_flags = 0;
  
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
  	}
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1926

d7365e783   Johannes Weiner   mm: memcontrol: f...
1927
  	rcu_read_unlock();
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1928
  }
d7365e783   Johannes Weiner   mm: memcontrol: f...
1929
1930
1931
1932
1933
1934
1935
1936
1937
  /**
   * mem_cgroup_update_page_stat - update page state statistics
   * @memcg: memcg to account against
   * @idx: page state item to account
   * @val: number of pages (positive or negative)
   *
   * See mem_cgroup_begin_page_stat() for locking requirements.
   */
  void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
68b4876d9   Sha Zhengju   memcg: remove MEM...
1938
  				 enum mem_cgroup_stat_index idx, int val)
d69b042f3   Balbir Singh   memcg: add file-b...
1939
  {
658b72c5a   Sha Zhengju   memcg: check for ...
1940
  	VM_BUG_ON(!rcu_read_lock_held());
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1941

d7365e783   Johannes Weiner   mm: memcontrol: f...
1942
1943
  	if (memcg)
  		this_cpu_add(memcg->stat->count[idx], val);
d69b042f3   Balbir Singh   memcg: add file-b...
1944
  }
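  /*
   * Illustrative sketch, not part of the original file: the transaction
   * pattern described above, modelled on file-mapped accounting.  The real
   * call sites live elsewhere in the tree; example_account_mapped() is a
   * hypothetical wrapper.
   */
  #if 0
  static void example_account_mapped(struct page *page)
  {
  	struct mem_cgroup *memcg;
  
  	memcg = mem_cgroup_begin_page_stat(page);
  	mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED, 1);
  	mem_cgroup_end_page_stat(memcg);
  }
  #endif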
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1945

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1946
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1947
1948
1949
   * size of the first charge trial. "32" comes from vmscan.c's magic value.
   * TODO: it may be necessary to use bigger numbers on big-iron machines.
   */
7ec99d621   Johannes Weiner   memcg: unify char...
1950
  #define CHARGE_BATCH	32U
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1951
1952
  struct memcg_stock_pcp {
  	struct mem_cgroup *cached; /* this is never the root cgroup */
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1953
  	unsigned int nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1954
  	struct work_struct work;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1955
  	unsigned long flags;
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
1956
  #define FLUSHING_CACHED_CHARGE	0
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1957
1958
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
9f50fad65   Michal Hocko   Revert "memcg: ge...
1959
  static DEFINE_MUTEX(percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1960

a0956d544   Suleiman Souhlal   memcg: make it po...
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
  /**
   * consume_stock: Try to consume stocked charge on this cpu.
   * @memcg: memcg to consume from.
   * @nr_pages: how many pages to charge.
   *
   * The charges will only happen if @memcg matches the current cpu's memcg
   * stock, and at least @nr_pages are available in that stock.  Failure to
   * service an allocation will refill the stock.
   *
   * returns true if successful, false otherwise.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1971
   */
a0956d544   Suleiman Souhlal   memcg: make it po...
1972
  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1973
1974
  {
  	struct memcg_stock_pcp *stock;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1975
  	bool ret = false;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1976

a0956d544   Suleiman Souhlal   memcg: make it po...
1977
  	if (nr_pages > CHARGE_BATCH)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1978
  		return ret;
a0956d544   Suleiman Souhlal   memcg: make it po...
1979

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1980
  	stock = &get_cpu_var(memcg_stock);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1981
  	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
a0956d544   Suleiman Souhlal   memcg: make it po...
1982
  		stock->nr_pages -= nr_pages;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1983
1984
  		ret = true;
  	}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1985
1986
1987
1988
1989
  	put_cpu_var(memcg_stock);
  	return ret;
  }
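  /*
   * Worked example (illustrative): a single-page charge that misses the stock
   * makes try_charge() charge CHARGE_BATCH (32) pages to the page counters and
   * refill_stock() the 31-page surplus, so up to the next 31 single-page
   * charges of the same memcg on this cpu can be served from the stock without
   * touching the counters.
   */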
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1990
   * Return cached charges to the page counters and reset the cached information.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1991
1992
1993
1994
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1995
  	if (stock->nr_pages) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1996
  		page_counter_uncharge(&old->memory, stock->nr_pages);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1997
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1998
  			page_counter_uncharge(&old->memsw, stock->nr_pages);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
1999
  		css_put_many(&old->css, stock->nr_pages);
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2000
  		stock->nr_pages = 0;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2001
2002
  	}
  	stock->cached = NULL;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2003
2004
2005
2006
2007
2008
2009
2010
  }
  
  /*
   * This must be called under preempt disabled or must be called by
   * a thread which is pinned to local cpu.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
7c8e0181e   Christoph Lameter   mm: replace __get...
2011
  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2012
  	drain_stock(stock);
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2013
  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2014
2015
2016
  }
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2017
   * Cache charges (nr_pages) in the local per-cpu stock.
320cc51d9   Greg Thelen   mm: fix typo in r...
2018
   * They will be consumed by consume_stock() later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2019
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2020
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2021
2022
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2023
  	if (stock->cached != memcg) { /* reset if necessary */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2024
  		drain_stock(stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2025
  		stock->cached = memcg;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2026
  	}
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2027
  	stock->nr_pages += nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2028
2029
2030
2031
  	put_cpu_var(memcg_stock);
  }
  
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
2032
   * Drains all per-CPU charge caches for the given root_memcg and the
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2033
   * subtree of the hierarchy under it.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2034
   */
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2035
  static void drain_all_stock(struct mem_cgroup *root_memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2036
  {
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2037
  	int cpu, curcpu;
d38144b7a   Michal Hocko   memcg: unify sync...
2038

6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2039
2040
2041
  	/* If someone's already draining, avoid starting more workers. */
  	if (!mutex_trylock(&percpu_charge_mutex))
  		return;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2042
  	/* Notify other cpus that system-wide "drain" is running */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2043
  	get_online_cpus();
5af12d0ef   Johannes Weiner   memcg: pin execut...
2044
  	curcpu = get_cpu();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2045
2046
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2047
  		struct mem_cgroup *memcg;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2048

c0ff4b854   Raghavendra K T   memcg: rename mem...
2049
2050
  		memcg = stock->cached;
  		if (!memcg || !stock->nr_pages)
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2051
  			continue;
2314b42db   Johannes Weiner   mm: memcontrol: d...
2052
  		if (!mem_cgroup_is_descendant(memcg, root_memcg))
3e92041d6   Michal Hocko   memcg: add mem_cg...
2053
  			continue;
d1a05b697   Michal Hocko   memcg: do not try...
2054
2055
2056
2057
2058
2059
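  		/*
  		 * Drain the current cpu's stock directly; remote cpus are
  		 * drained via their per-cpu work items.  FLUSHING_CACHED_CHARGE
  		 * makes sure a stock that is already being flushed is not
  		 * queued again.
  		 */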
  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
  			if (cpu == curcpu)
  				drain_local_stock(&stock->work);
  			else
  				schedule_work_on(cpu, &stock->work);
  		}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2060
  	}
5af12d0ef   Johannes Weiner   memcg: pin execut...
2061
  	put_cpu();
f894ffa86   Andrew Morton   memcg: trivial cl...
2062
  	put_online_cpus();
9f50fad65   Michal Hocko   Revert "memcg: ge...
2063
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2064
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2065
2066
2067
2068
  /*
   * This function drains the percpu counter values from a DEAD cpu and
   * moves them into memcg->nocpu_base. Note that this function can be preempted.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2069
  static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2070
2071
  {
  	int i;
c0ff4b854   Raghavendra K T   memcg: rename mem...
2072
  	spin_lock(&memcg->pcp_counter_lock);
6104621de   Johannes Weiner   mm: memcg: remove...
2073
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2074
  		long x = per_cpu(memcg->stat->count[i], cpu);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2075

c0ff4b854   Raghavendra K T   memcg: rename mem...
2076
2077
  		per_cpu(memcg->stat->count[i], cpu) = 0;
  		memcg->nocpu_base.count[i] += x;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2078
  	}
e9f8974f2   Johannes Weiner   memcg: break out ...
2079
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
2080
  		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
2081

c0ff4b854   Raghavendra K T   memcg: rename mem...
2082
2083
  		per_cpu(memcg->stat->events[i], cpu) = 0;
  		memcg->nocpu_base.events[i] += x;
e9f8974f2   Johannes Weiner   memcg: break out ...
2084
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2085
  	spin_unlock(&memcg->pcp_counter_lock);
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2086
  }
0db0628d9   Paul Gortmaker   kernel: delete __...
2087
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2088
2089
2090
2091
2092
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2093
  	struct mem_cgroup *iter;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2094

619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
2095
  	if (action == CPU_ONLINE)
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2096
  		return NOTIFY_OK;
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2097

d833049bd   Kirill A. Shutemov   memcg: fix broken...
2098
  	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2099
  		return NOTIFY_OK;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2100

9f3a0d093   Johannes Weiner   mm: memcg: consol...
2101
  	for_each_mem_cgroup(iter)
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2102
  		mem_cgroup_drain_pcp_counter(iter, cpu);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2103
2104
2105
2106
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2107
2108
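  /*
   * try_charge() attempts to charge @nr_pages to @memcg, batching up to
   * CHARGE_BATCH pages and stocking the surplus.  It returns 0 on success,
   * -ENOMEM if the charge failed and the caller may handle the failure, and
   * -EINTR if the charge was bypassed because the task is dying or the
   * allocation must not fail; callers treat -EINTR as a charge against the
   * root cgroup.
   */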
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
  		      unsigned int nr_pages)
8a9f3ccd2   Balbir Singh   Memory controller...
2109
  {
7ec99d621   Johannes Weiner   memcg: unify char...
2110
  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
9b1306192   Johannes Weiner   mm: memcontrol: r...
2111
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2112
  	struct mem_cgroup *mem_over_limit;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2113
  	struct page_counter *counter;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2114
  	unsigned long nr_reclaimed;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2115
2116
  	bool may_swap = true;
  	bool drained = false;
05b843012   Johannes Weiner   mm: memcontrol: u...
2117
  	int ret = 0;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
2118

ce00a9673   Johannes Weiner   mm: memcontrol: r...
2119
2120
  	if (mem_cgroup_is_root(memcg))
  		goto done;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2121
  retry:
b6b6cc72b   Michal Hocko   memcg: do not rep...
2122
2123
  	if (consume_stock(memcg, nr_pages))
  		goto done;
8a9f3ccd2   Balbir Singh   Memory controller...
2124

3fbe72442   Johannes Weiner   mm: memcontrol: s...
2125
  	if (!do_swap_account ||
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2126
2127
  	    !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
  		if (!page_counter_try_charge(&memcg->memory, batch, &counter))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2128
  			goto done_restock;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2129
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2130
2131
  			page_counter_uncharge(&memcg->memsw, batch);
  		mem_over_limit = mem_cgroup_from_counter(counter, memory);
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2132
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2133
  		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2134
  		may_swap = false;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2135
  	}
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2136

6539cc053   Johannes Weiner   mm: memcontrol: f...
2137
2138
2139
2140
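  	/*
  	 * The batched charge overshot the limit; retry with only the
  	 * pages actually requested before resorting to reclaim.
  	 */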
  	if (batch > nr_pages) {
  		batch = nr_pages;
  		goto retry;
  	}
6d61ef409   Balbir Singh   memcg: memory cgr...
2141

06b078fc0   Johannes Weiner   mm: memcontrol: r...
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
  	/*
  	 * Unlike in global OOM situations, memcg is not in a physical
  	 * memory shortage.  Allow dying and OOM-killed tasks to
  	 * bypass the last charges so that they can exit quickly and
  	 * free their memory.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
  		     fatal_signal_pending(current) ||
  		     current->flags & PF_EXITING))
  		goto bypass;
  
  	if (unlikely(task_in_memcg_oom(current)))
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2155
2156
  	if (!(gfp_mask & __GFP_WAIT))
  		goto nomem;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2157

241994ed8   Johannes Weiner   mm: memcontrol: d...
2158
  	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2159
2160
  	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
  						    gfp_mask, may_swap);
6539cc053   Johannes Weiner   mm: memcontrol: f...
2161

61e02c745   Johannes Weiner   mm: memcontrol: c...
2162
  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
6539cc053   Johannes Weiner   mm: memcontrol: f...
2163
  		goto retry;
28c34c291   Johannes Weiner   mm: memcontrol: r...
2164

b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2165
  	if (!drained) {
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2166
  		drain_all_stock(mem_over_limit);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2167
2168
2169
  		drained = true;
  		goto retry;
  	}
28c34c291   Johannes Weiner   mm: memcontrol: r...
2170
2171
  	if (gfp_mask & __GFP_NORETRY)
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2172
2173
2174
2175
2176
2177
2178
2179
2180
  	/*
  	 * Even though the limit is exceeded at this point, reclaim
  	 * may have been able to free some pages.  Retry the charge
  	 * before killing the task.
  	 *
  	 * Only for regular pages, though: huge pages are rather
  	 * unlikely to succeed so close to the limit, and we fall back
  	 * to regular pages anyway in case of failure.
  	 */
61e02c745   Johannes Weiner   mm: memcontrol: c...
2181
  	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2182
2183
2184
2185
2186
2187
2188
  		goto retry;
  	/*
  	 * During task move, charges can be doubly counted. So it's
  	 * better to wait until the task move finishes if one is in progress.
  	 */
  	if (mem_cgroup_wait_acct_move(mem_over_limit))
  		goto retry;
9b1306192   Johannes Weiner   mm: memcontrol: r...
2189
2190
  	if (nr_retries--)
  		goto retry;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
2191
2192
  	if (gfp_mask & __GFP_NOFAIL)
  		goto bypass;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2193
2194
  	if (fatal_signal_pending(current))
  		goto bypass;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2195
  	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
61e02c745   Johannes Weiner   mm: memcontrol: c...
2196
  	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2197
  nomem:
6d1fdc489   Johannes Weiner   memcg: sanitize _...
2198
  	if (!(gfp_mask & __GFP_NOFAIL))
3168ecbe1   Johannes Weiner   mm: memcg: use pr...
2199
  		return -ENOMEM;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2200
  bypass:
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2201
  	return -EINTR;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2202
2203
  
  done_restock:
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2204
  	css_get_many(&memcg->css, batch);
6539cc053   Johannes Weiner   mm: memcontrol: f...
2205
2206
  	if (batch > nr_pages)
  		refill_stock(memcg, batch - nr_pages);
7d638093d   Vladimir Davydov   memcg: do not cal...
2207
2208
  	if (!(gfp_mask & __GFP_WAIT))
  		goto done;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
  	/*
  	 * If the hierarchy is above the normal consumption range,
  	 * make the charging task trim its excess contribution.
  	 */
  	do {
  		if (page_counter_read(&memcg->memory) <= memcg->high)
  			continue;
  		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
  		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
  	} while ((memcg = parent_mem_cgroup(memcg)));
6539cc053   Johannes Weiner   mm: memcontrol: f...
2219
  done:
05b843012   Johannes Weiner   mm: memcontrol: u...
2220
  	return ret;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2221
  }
8a9f3ccd2   Balbir Singh   Memory controller...
2222

00501b531   Johannes Weiner   mm: memcontrol: r...
2223
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2224
  {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2225
2226
  	if (mem_cgroup_is_root(memcg))
  		return;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2227
  	page_counter_uncharge(&memcg->memory, nr_pages);
05b843012   Johannes Weiner   mm: memcontrol: u...
2228
  	if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2229
  		page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2230

e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2231
  	css_put_many(&memcg->css, nr_pages);
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2232
2233
2234
  }
  
  /*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2235
2236
2237
2238
2239
2240
2241
2242
2243
   * try_get_mem_cgroup_from_page - look up page's memcg association
   * @page: the page
   *
   * Look up, get a css reference, and return the memcg that owns @page.
   *
   * The page must be locked to prevent racing with swap-in and page
   * cache charges.  If coming from an unlocked page table, the caller
   * must ensure the page is on the LRU or this can race with charging.
   */
e42d9d5d4   Wu Fengguang   memcg: rename and...
2244
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2245
  {
298333157   Johannes Weiner   mm: memcontrol: r...
2246
  	struct mem_cgroup *memcg;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2247
  	unsigned short id;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2248
  	swp_entry_t ent;
309381fea   Sasha Levin   mm: dump page whe...
2249
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
3c776e646   Daisuke Nishimura   memcg: charge swa...
2250

1306a85ae   Johannes Weiner   mm: embed the mem...
2251
  	memcg = page->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
2252
2253
  	if (memcg) {
  		if (!css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2254
  			memcg = NULL;
e42d9d5d4   Wu Fengguang   memcg: rename and...
2255
  	} else if (PageSwapCache(page)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
2256
  		ent.val = page_private(page);
9fb4b7cc0   Bob Liu   page_cgroup: add ...
2257
  		id = lookup_swap_cgroup_id(ent);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2258
  		rcu_read_lock();
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
2259
  		memcg = mem_cgroup_from_id(id);
ec903c0c8   Tejun Heo   cgroup: rename cs...
2260
  		if (memcg && !css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2261
  			memcg = NULL;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2262
  		rcu_read_unlock();
3c776e646   Daisuke Nishimura   memcg: charge swa...
2263
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2264
  	return memcg;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2265
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
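  /*
   * lock_page_lru()/unlock_page_lru() temporarily take the page off its LRU
   * list (under zone->lru_lock) so that commit_charge() can rewrite
   * page->mem_cgroup without a concurrent LRU walk seeing the page on the
   * wrong lruvec.
   */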
  static void lock_page_lru(struct page *page, int *isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	spin_lock_irq(&zone->lru_lock);
  	if (PageLRU(page)) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		ClearPageLRU(page);
  		del_page_from_lru_list(page, lruvec, page_lru(page));
  		*isolated = 1;
  	} else
  		*isolated = 0;
  }
  
  static void unlock_page_lru(struct page *page, int isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	if (isolated) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		SetPageLRU(page);
  		add_page_to_lru_list(page, lruvec, page_lru(page));
  	}
  	spin_unlock_irq(&zone->lru_lock);
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2296
  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
6abb5a867   Johannes Weiner   mm: memcontrol: a...
2297
  			  bool lrucare)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2298
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2299
  	int isolated;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2300

1306a85ae   Johannes Weiner   mm: embed the mem...
2301
  	VM_BUG_ON_PAGE(page->mem_cgroup, page);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2302
2303
2304
2305
2306
  
  	/*
  	 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page
  	 * may already be on some other mem_cgroup's LRU.  Take care of it.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2307
2308
  	if (lrucare)
  		lock_page_lru(page, &isolated);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2309

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2310
2311
  	/*
  	 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
2312
  	 * page->mem_cgroup at this point:
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
  	 *
  	 * - the page is uncharged
  	 *
  	 * - the page is off-LRU
  	 *
  	 * - an anonymous fault has exclusive page access, except for
  	 *   a locked page table
  	 *
  	 * - a page cache insertion, a swapin fault, or a migration
  	 *   have the page locked
  	 */
1306a85ae   Johannes Weiner   mm: embed the mem...
2324
  	page->mem_cgroup = memcg;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2325

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2326
2327
  	if (lrucare)
  		unlock_page_lru(page, isolated);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2328
  }
66e1707bc   Balbir Singh   Memory controller...
2329

7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2330
  #ifdef CONFIG_MEMCG_KMEM
dbf22eb6d   Vladimir Davydov   memcg: zap __memc...
2331
2332
  int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
  		      unsigned long nr_pages)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2333
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2334
  	struct page_counter *counter;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2335
  	int ret = 0;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2336

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2337
2338
  	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
  	if (ret < 0)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2339
  		return ret;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2340
  	ret = try_charge(memcg, gfp, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2341
2342
  	if (ret == -EINTR)  {
  		/*
00501b531   Johannes Weiner   mm: memcontrol: r...
2343
2344
2345
2346
2347
2348
  		 * try_charge() chose to bypass to root due to OOM kill or
  		 * fatal signal.  Since our only options are to either fail
  		 * the allocation or charge it to this cgroup, do the latter as a
  		 * temporary condition, because we can't fail: from a kmem/slab
  		 * perspective, the cache has already been selected, by
  		 * mem_cgroup_kmem_get_cache(), so it is too late to change
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2349
2350
2351
  		 * our minds.
  		 *
  		 * This condition will only trigger if the task entered
00501b531   Johannes Weiner   mm: memcontrol: r...
2352
2353
2354
  		 * memcg_charge_kmem in a sane state, but was OOM-killed
  		 * during try_charge() above. Tasks that were already dying
  		 * when the allocation triggers should have been already
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2355
2356
  		 * directed to the root cgroup in memcontrol.h
  		 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2357
  		page_counter_charge(&memcg->memory, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2358
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2359
  			page_counter_charge(&memcg->memsw, nr_pages);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2360
  		css_get_many(&memcg->css, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2361
2362
  		ret = 0;
  	} else if (ret)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2363
  		page_counter_uncharge(&memcg->kmem, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2364
2365
2366
  
  	return ret;
  }
dbf22eb6d   Vladimir Davydov   memcg: zap __memc...
2367
  void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2368
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2369
  	page_counter_uncharge(&memcg->memory, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2370
  	if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2371
  		page_counter_uncharge(&memcg->memsw, nr_pages);
7de37682b   Glauber Costa   memcg: kmem accou...
2372

64f219938   Johannes Weiner   mm: memcontrol: r...
2373
  	page_counter_uncharge(&memcg->kmem, nr_pages);
7de37682b   Glauber Costa   memcg: kmem accou...
2374

e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2375
  	css_put_many(&memcg->css, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2376
  }
2633d7a02   Glauber Costa   slab/slub: consid...
2377
2378
2379
2380
2381
2382
2383
2384
2385
  /*
   * Helper for accessing a memcg's kmem cache index. It is used as an index
   * into the child cache array in kmem_cache, and also to derive the cache's
   * name. This function returns -1 when this is not a kmem-limited memcg.
   */
  int memcg_cache_id(struct mem_cgroup *memcg)
  {
  	return memcg ? memcg->kmemcg_id : -1;
  }
f3bb3043a   Vladimir Davydov   memcg: don't call...
2386
  static int memcg_alloc_cache_id(void)
55007d849   Glauber Costa   memcg: allocate m...
2387
  {
f3bb3043a   Vladimir Davydov   memcg: don't call...
2388
2389
  	int id, size;
  	int err;
dbcf73e26   Vladimir Davydov   memcg: rename som...
2390
  	id = ida_simple_get(&memcg_cache_ida,
f3bb3043a   Vladimir Davydov   memcg: don't call...
2391
2392
2393
  			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
  	if (id < 0)
  		return id;
55007d849   Glauber Costa   memcg: allocate m...
2394

dbcf73e26   Vladimir Davydov   memcg: rename som...
2395
  	if (id < memcg_nr_cache_ids)
f3bb3043a   Vladimir Davydov   memcg: don't call...
2396
2397
2398
2399
2400
2401
  		return id;
  
  	/*
  	 * There's no space for the new id in memcg_caches arrays,
  	 * so we have to grow them.
  	 */
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2402
  	down_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2403
2404
  
  	size = 2 * (id + 1);
55007d849   Glauber Costa   memcg: allocate m...
2405
2406
2407
2408
  	if (size < MEMCG_CACHES_MIN_SIZE)
  		size = MEMCG_CACHES_MIN_SIZE;
  	else if (size > MEMCG_CACHES_MAX_SIZE)
  		size = MEMCG_CACHES_MAX_SIZE;
f3bb3043a   Vladimir Davydov   memcg: don't call...
2409
  	err = memcg_update_all_caches(size);
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2410
  	if (!err)
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2411
2412
  		err = memcg_update_all_list_lrus(size);
  	if (!err)
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2413
2414
2415
  		memcg_nr_cache_ids = size;
  
  	up_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2416
  	if (err) {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2417
  		ida_simple_remove(&memcg_cache_ida, id);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2418
2419
2420
2421
2422
2423
2424
  		return err;
  	}
  	return id;
  }
  
  static void memcg_free_cache_id(int id)
  {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2425
  	ida_simple_remove(&memcg_cache_ida, id);
55007d849   Glauber Costa   memcg: allocate m...
2426
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2427
  struct memcg_kmem_cache_create_work {
5722d094a   Vladimir Davydov   memcg, slab: clea...
2428
2429
2430
2431
  	struct mem_cgroup *memcg;
  	struct kmem_cache *cachep;
  	struct work_struct work;
  };
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2432
  static void memcg_kmem_cache_create_func(struct work_struct *w)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2433
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2434
2435
  	struct memcg_kmem_cache_create_work *cw =
  		container_of(w, struct memcg_kmem_cache_create_work, work);
5722d094a   Vladimir Davydov   memcg, slab: clea...
2436
2437
  	struct mem_cgroup *memcg = cw->memcg;
  	struct kmem_cache *cachep = cw->cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2438

d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2439
  	memcg_create_kmem_cache(memcg, cachep);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2440

5722d094a   Vladimir Davydov   memcg, slab: clea...
2441
  	css_put(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2442
2443
2444
2445
2446
  	kfree(cw);
  }
  
  /*
   * Enqueue the creation of a per-memcg kmem_cache.
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2447
   */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2448
2449
  static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					       struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2450
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2451
  	struct memcg_kmem_cache_create_work *cw;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2452

776ed0f03   Vladimir Davydov   memcg: cleanup km...
2453
  	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2454
  	if (!cw)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2455
  		return;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2456
2457
  
  	css_get(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2458
2459
2460
  
  	cw->memcg = memcg;
  	cw->cachep = cachep;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2461
  	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2462

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2463
2464
  	schedule_work(&cw->work);
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2465
2466
  static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					     struct kmem_cache *cachep)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2467
2468
2469
2470
  {
  	/*
  	 * We need to stop accounting when we kmalloc, because if the
  	 * corresponding kmalloc cache is not yet created, the first allocation
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2471
  	 * in __memcg_schedule_kmem_cache_create will recurse.
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2472
2473
2474
2475
2476
2477
2478
  	 *
  	 * However, it is better to enclose the whole function. Depending on
  	 * the debugging options enabled, INIT_WORK(), for instance, can
  	 * trigger an allocation. This, too, would make us recurse. Because at
  	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
  	 * the safest choice is to do it like this, wrapping the whole function.
  	 */
6f185c290   Vladimir Davydov   memcg: turn memcg...
2479
  	current->memcg_kmem_skip_account = 1;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2480
  	__memcg_schedule_kmem_cache_create(memcg, cachep);
6f185c290   Vladimir Davydov   memcg: turn memcg...
2481
  	current->memcg_kmem_skip_account = 0;
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2482
  }
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2483

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
  /*
   * Return the kmem_cache we're supposed to use for a slab allocation.
   * We try to use the current memcg's version of the cache.
   *
   * If the cache does not exist yet and we are its first user, we either
   * create it immediately, if possible, or create it asynchronously in a
   * workqueue.
   * In the latter case, we let the current allocation go through with
   * the original cache.
   *
   * Can't be called in interrupt context or from kernel threads.
   * This function needs to be called with rcu_read_lock() held.
   */
056b7ccef   Zhang Zhen   mm/memcontrol.c: ...
2497
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2498
2499
  {
  	struct mem_cgroup *memcg;
959c8963f   Vladimir Davydov   memcg, slab: fix ...
2500
  	struct kmem_cache *memcg_cachep;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2501
  	int kmemcg_id;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2502

f7ce3190c   Vladimir Davydov   slab: embed memcg...
2503
  	VM_BUG_ON(!is_root_cache(cachep));
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2504

9d100c5e4   Vladimir Davydov   memcg: don't chec...
2505
  	if (current->memcg_kmem_skip_account)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2506
  		return cachep;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2507
  	memcg = get_mem_cgroup_from_mm(current->mm);
4db0c3c29   Jason Low   mm: remove rest o...
2508
  	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2509
  	if (kmemcg_id < 0)
ca0dde971   Li Zefan   memcg: take refer...
2510
  		goto out;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2511

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2512
  	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2513
2514
  	if (likely(memcg_cachep))
  		return memcg_cachep;
ca0dde971   Li Zefan   memcg: take refer...
2515
2516
2517
2518
2519
2520
2521
2522
2523
  
  	/*
  	 * If we are in a safe context (can wait, and not in interrupt
  	 * context), we could be predictable and return right away.
  	 * This would guarantee that the allocation being performed
  	 * already belongs in the new cache.
  	 *
  	 * However, there are some clashes that can arise from locking.
  	 * For instance, because we acquire the slab_mutex while doing
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2524
2525
2526
  	 * memcg_create_kmem_cache, this means no further allocation
  	 * could happen with the slab_mutex held. So it's better to
  	 * defer everything.
ca0dde971   Li Zefan   memcg: take refer...
2527
  	 */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2528
  	memcg_schedule_kmem_cache_create(memcg, cachep);
ca0dde971   Li Zefan   memcg: take refer...
2529
  out:
8135be5a8   Vladimir Davydov   memcg: fix possib...
2530
  	css_put(&memcg->css);
ca0dde971   Li Zefan   memcg: take refer...
2531
  	return cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2532
  }
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2533

8135be5a8   Vladimir Davydov   memcg: fix possib...
2534
2535
2536
  void __memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
  	if (!is_root_cache(cachep))
f7ce3190c   Vladimir Davydov   slab: embed memcg...
2537
  		css_put(&cachep->memcg_params.memcg->css);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2538
  }
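  /*
   * Illustrative sketch (not part of this file): a slab allocation path is
   * expected to bracket the allocation roughly like this, via the wrappers
   * in include/linux/memcontrol.h:
   *
   *	cachep = memcg_kmem_get_cache(cachep, gfpflags);
   *	... allocate the object from cachep ...
   *	memcg_kmem_put_cache(cachep);
   */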
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
  /*
   * We need to verify if the allocation against current->mm->owner's memcg is
   * possible for the given order. But the page is not allocated yet, so we'll
   * need a further commit step to do the final arrangements.
   *
   * It is possible for the task to switch cgroups in the meantime, so at
   * commit time, we can't rely on task conversion any longer.  We'll then use
   * the handle argument to return to the caller which cgroup we should commit
   * against. We could also return the memcg directly and avoid the pointer
   * passing, but a boolean return value gives better semantics considering
   * the compiled-out case as well.
   *
   * Returning true means the allocation is possible.
   */
  bool
  __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	*_memcg = NULL;
6d42c232b   Glauber Costa   memcg: also test ...
2560

df3819754   Johannes Weiner   memcg: get_mem_cg...
2561
  	memcg = get_mem_cgroup_from_mm(current->mm);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2562

cf2b8fbf1   Vladimir Davydov   memcg: zap memcg_...
2563
  	if (!memcg_kmem_is_active(memcg)) {
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2564
2565
2566
  		css_put(&memcg->css);
  		return true;
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2567
  	ret = memcg_charge_kmem(memcg, gfp, 1 << order);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2568
2569
  	if (!ret)
  		*_memcg = memcg;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2570
2571
2572
2573
2574
2575
2576
2577
  
  	css_put(&memcg->css);
  	return (ret == 0);
  }
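  /*
   * Illustrative sketch (not part of this file), assuming the
   * memcg_kmem_newpage_charge()/memcg_kmem_commit_charge() wrappers from
   * include/linux/memcontrol.h; this is roughly what a kmem page allocation
   * path looks like:
   *
   *	struct mem_cgroup *memcg = NULL;
   *	struct page *page;
   *
   *	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
   *		return NULL;
   *	page = alloc_pages(gfp_mask, order);
   *	memcg_kmem_commit_charge(page, memcg, order);
   *	return page;
   */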
  
  void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      int order)
  {
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2578
2579
2580
2581
  	VM_BUG_ON(mem_cgroup_is_root(memcg));
  
  	/* The page allocation failed. Revert */
  	if (!page) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2582
  		memcg_uncharge_kmem(memcg, 1 << order);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2583
2584
  		return;
  	}
1306a85ae   Johannes Weiner   mm: embed the mem...
2585
  	page->mem_cgroup = memcg;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2586
2587
2588
2589
  }
  
  void __memcg_kmem_uncharge_pages(struct page *page, int order)
  {
1306a85ae   Johannes Weiner   mm: embed the mem...
2590
  	struct mem_cgroup *memcg = page->mem_cgroup;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2591

7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2592
2593
  	if (!memcg)
  		return;
309381fea   Sasha Levin   mm: dump page whe...
2594
  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
298333157   Johannes Weiner   mm: memcontrol: r...
2595

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2596
  	memcg_uncharge_kmem(memcg, 1 << order);
1306a85ae   Johannes Weiner   mm: embed the mem...
2597
  	page->mem_cgroup = NULL;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2598
  }
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
  
  struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
  {
  	struct mem_cgroup *memcg = NULL;
  	struct kmem_cache *cachep;
  	struct page *page;
  
  	page = virt_to_head_page(ptr);
  	if (PageSlab(page)) {
  		cachep = page->slab_cache;
  		if (!is_root_cache(cachep))
f7ce3190c   Vladimir Davydov   slab: embed memcg...
2610
  			memcg = cachep->memcg_params.memcg;
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2611
2612
2613
2614
2615
2616
  	} else
  		/* page allocated by alloc_kmem_pages */
  		memcg = page->mem_cgroup;
  
  	return memcg;
  }
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2617
  #endif /* CONFIG_MEMCG_KMEM */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2618
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2619
2620
  /*
   * Because tail pages are not marked as "used", set them up here. We're under
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2621
2622
2623
   * zone->lru_lock, 'splitting on pmd' and compound_lock.
   * charge/uncharge can never happen and move_account() is done under
   * compound_lock(), so we don't have to take care of races.
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2624
   */
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2625
  void mem_cgroup_split_huge_fixup(struct page *head)
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2626
  {
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2627
  	int i;
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2628

3d37c4a91   KAMEZAWA Hiroyuki   memcg: bugfix che...
2629
2630
  	if (mem_cgroup_disabled())
  		return;
b070e65c0   David Rientjes   mm, memcg: add rs...
2631

298333157   Johannes Weiner   mm: memcontrol: r...
2632
  	for (i = 1; i < HPAGE_PMD_NR; i++)
1306a85ae   Johannes Weiner   mm: embed the mem...
2633
  		head[i].mem_cgroup = head->mem_cgroup;
b9982f8d2   Michal Hocko   mm: memcontrol: m...
2634

1306a85ae   Johannes Weiner   mm: embed the mem...
2635
  	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
b070e65c0   David Rientjes   mm, memcg: add rs...
2636
  		       HPAGE_PMD_NR);
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2637
  }
12d271078   Hugh Dickins   memcg: fix split_...
2638
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2639

c255a4580   Andrew Morton   memcg: rename con...
2640
  #ifdef CONFIG_MEMCG_SWAP
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2641
2642
  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
  					 bool charge)
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2643
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2644
2645
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2646
  }
024914477   Daisuke Nishimura   memcg: move charg...
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2659
   * The caller must have charged @to, IOW, called page_counter_charge() for
024914477   Daisuke Nishimura   memcg: move charg...
2660
2661
2662
   * both memory and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2663
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2664
2665
  {
  	unsigned short old_id, new_id;
34c00c319   Li Zefan   memcg: convert to...
2666
2667
  	old_id = mem_cgroup_id(from);
  	new_id = mem_cgroup_id(to);
024914477   Daisuke Nishimura   memcg: move charg...
2668
2669
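  	/*
  	 * swap_cgroup_cmpxchg() only rewrites the swap entry's owner record
  	 * if it still belongs to @from, so a racing change makes the move
  	 * fail instead of corrupting the accounting.
  	 */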
  
  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
024914477   Daisuke Nishimura   memcg: move charg...
2670
  		mem_cgroup_swap_statistics(from, false);
483c30b51   Daisuke Nishimura   memcg: improve pe...
2671
  		mem_cgroup_swap_statistics(to, true);
024914477   Daisuke Nishimura   memcg: move charg...
2672
2673
2674
2675
2676
2677
  		return 0;
  	}
  	return -EINVAL;
  }
  #else
  static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2678
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2679
2680
2681
  {
  	return -EINVAL;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2682
  #endif
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2683

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2684
  static DEFINE_MUTEX(memcg_limit_mutex);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
2685

d38d2a758   KOSAKI Motohiro   mm: make mem_cgro...
2686
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2687
  				   unsigned long limit)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2688
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2689
2690
2691
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2692
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2693
  	int ret;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2694
2695
2696
2697
2698
2699
  
  	/*
  	 * To keep hierarchical reclaim simple, how long we should retry
  	 * depends on the caller. We set our retry count to be a function
  	 * of the number of children we may visit in this loop.
  	 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2700
2701
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2702

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2703
  	oldusage = page_counter_read(&memcg->memory);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2704

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2705
  	do {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2706
2707
2708
2709
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2710
2711
2712
2713
  
  		mutex_lock(&memcg_limit_mutex);
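  		/* The memory limit must never exceed the memory+swap limit. */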
  		if (limit > memcg->memsw.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2714
  			ret = -EINVAL;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2715
2716
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2717
2718
2719
2720
  		if (limit > memcg->memory.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memory, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2721
2722
2723
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2724
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2725
  		curusage = page_counter_read(&memcg->memory);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2726
  		/* Usage is reduced ? */
f894ffa86   Andrew Morton   memcg: trivial cl...
2727
  		if (curusage >= oldusage)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2728
2729
2730
  			retry_count--;
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2731
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2732
2733
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
14797e236   KOSAKI Motohiro   memcg: add inacti...
2734

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2735
2736
  	return ret;
  }
338c84310   Li Zefan   memcg: remove som...
2737
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2738
  					 unsigned long limit)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2739
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2740
2741
2742
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2743
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2744
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2745

81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2746
  	/* see mem_cgroup_resize_limit */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2747
2748
2749
2750
2751
2752
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
  
  	oldusage = page_counter_read(&memcg->memsw);
  
  	do {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2753
2754
2755
2756
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2757
2758
2759
2760
  
  		mutex_lock(&memcg_limit_mutex);
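  		/* The memory+swap limit must never be below the memory limit. */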
  		if (limit < memcg->memory.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2761
  			ret = -EINVAL;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2762
2763
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2764
2765
2766
2767
  		if (limit > memcg->memsw.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memsw, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2768
2769
2770
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2771
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2772
  		curusage = page_counter_read(&memcg->memsw);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2773
  		/* Usage is reduced ? */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2774
  		if (curusage >= oldusage)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2775
  			retry_count--;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2776
2777
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2778
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2779
2780
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2781

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2782
2783
  	return ret;
  }
0608f43da   Andrew Morton   revert "memcg, vm...
2784
2785
2786
2787
2788
2789
2790
2791
2792
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  					    gfp_t gfp_mask,
  					    unsigned long *total_scanned)
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2793
  	unsigned long excess;
0608f43da   Andrew Morton   revert "memcg, vm...
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
  	unsigned long nr_scanned;
  
  	if (order > 0)
  		return 0;
  
  	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
  	/*
  	 * This loop can run for a while, especially if mem_cgroups continuously
  	 * keep exceeding their soft limit and putting the system under
  	 * pressure.
  	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
  
  		nr_scanned = 0;
  		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
  						    gfp_mask, &nr_scanned);
  		nr_reclaimed += reclaimed;
  		*total_scanned += nr_scanned;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2818
  		spin_lock_irq(&mctz->lock);
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2819
  		__mem_cgroup_remove_exceeded(mz, mctz);
0608f43da   Andrew Morton   revert "memcg, vm...
2820
2821
2822
2823
2824
2825
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2826
2827
  		if (!reclaimed)
  			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2828
  		excess = soft_limit_excess(mz->memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
2829
2830
2831
2832
2833
2834
2835
2836
2837
  		/*
  		 * One school of thought says that we should not add
  		 * the node back to the tree if reclaim returns 0.
  		 * But our reclaim could return 0 simply because, due
  		 * to priority, we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this a longer-term
  		 * TODO.
  		 */
  		/* If excess == 0, no tree ops */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
2838
  		__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2839
  		spin_unlock_irq(&mctz->lock);
0608f43da   Andrew Morton   revert "memcg, vm...
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
  		css_put(&mz->memcg->css);
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
  		css_put(&next_mz->memcg->css);
  	return nr_reclaimed;
  }
ea280e7b4   Tejun Heo   memcg: update mem...
2856
2857
2858
2859
2860
2861
  /*
   * Test whether @memcg has children, dead or alive.  Note that this
   * function doesn't care whether @memcg has use_hierarchy enabled and
   * returns %true if there are child csses according to the cgroup
   * hierarchy.  Testing use_hierarchy is the caller's responsibility.
   */
b5f99b537   Glauber Costa   memcg: fast hiera...
2862
2863
  static inline bool memcg_has_children(struct mem_cgroup *memcg)
  {
ea280e7b4   Tejun Heo   memcg: update mem...
2864
  	bool ret;
696ac172f   Johannes Weiner   mm: memcg: fix te...
2865
  	/*
ea280e7b4   Tejun Heo   memcg: update mem...
2866
2867
2868
2869
  	 * The lock does not prevent addition or deletion of children, but
  	 * it prevents a new child from being initialized based on this
  	 * parent in css_online(), so it's enough to decide whether
  	 * hierarchically inherited attributes can still be changed or not.
696ac172f   Johannes Weiner   mm: memcg: fix te...
2870
  	 */
ea280e7b4   Tejun Heo   memcg: update mem...
2871
2872
2873
2874
2875
2876
  	lockdep_assert_held(&memcg_create_mutex);
  
  	rcu_read_lock();
  	ret = css_next_child(NULL, &memcg->css);
  	rcu_read_unlock();
  	return ret;
b5f99b537   Glauber Costa   memcg: fast hiera...
2877
2878
2879
  }
  
  /*
c26251f9f   Michal Hocko   memcg: split mem_...
2880
2881
2882
2883
2884
2885
2886
2887
   * Reclaims as many pages from the given memcg as possible.
   *
   * Caller is responsible for holding css reference for memcg.
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
  {
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c26251f9f   Michal Hocko   memcg: split mem_...
2888

c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2889
2890
  	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2891
  	/* try to free all pages in this cgroup */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2892
  	while (nr_retries && page_counter_read(&memcg->memory)) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2893
  		int progress;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2894

c26251f9f   Michal Hocko   memcg: split mem_...
2895
2896
  		if (signal_pending(current))
  			return -EINTR;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2897
2898
  		progress = try_to_free_mem_cgroup_pages(memcg, 1,
  							GFP_KERNEL, true);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2899
  		if (!progress) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2900
  			nr_retries--;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2901
  			/* maybe some writeback is necessary */
8aa7e847d   Jens Axboe   Fix congestion_wa...
2902
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2903
  		}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2904
2905
  
  	}
ab5196c20   Michal Hocko   memcg: make mem_c...
2906
2907
  
  	return 0;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2908
  }
6770c64e5   Tejun Heo   cgroup: replace c...
2909
2910
2911
  static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
  					    char *buf, size_t nbytes,
  					    loff_t off)
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2912
  {
6770c64e5   Tejun Heo   cgroup: replace c...
2913
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
c26251f9f   Michal Hocko   memcg: split mem_...
2914

d84230118   Michal Hocko   memcg: root_cgrou...
2915
2916
  	if (mem_cgroup_is_root(memcg))
  		return -EINVAL;
6770c64e5   Tejun Heo   cgroup: replace c...
2917
  	return mem_cgroup_force_empty(memcg) ?: nbytes;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2918
  }
182446d08   Tejun Heo   cgroup: pass arou...
2919
2920
  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
  				     struct cftype *cft)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2921
  {
182446d08   Tejun Heo   cgroup: pass arou...
2922
  	return mem_cgroup_from_css(css)->use_hierarchy;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2923
  }
182446d08   Tejun Heo   cgroup: pass arou...
2924
2925
  static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
  				      struct cftype *cft, u64 val)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2926
2927
  {
  	int retval = 0;
182446d08   Tejun Heo   cgroup: pass arou...
2928
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
2929
  	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
2930

0999821b1   Glauber Costa   memcg: replace cg...
2931
  	mutex_lock(&memcg_create_mutex);
567fb435b   Glauber Costa   memcg: fix bad be...
2932
2933
2934
  
  	if (memcg->use_hierarchy == val)
  		goto out;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2935
  	/*
af901ca18   André Goddard Rosa   tree-wide: fix as...
2936
  	 * If parent's use_hierarchy is set, we can't make any modifications
18f59ea7d   Balbir Singh   memcg: memory cgr...
2937
2938
2939
2940
2941
2942
  	 * in the child subtrees. If it is unset, then the change can
  	 * occur, provided the current cgroup has no children.
  	 *
  	 * For the root cgroup, parent_mem is NULL, we allow value to be
  	 * set if there are no children.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2943
  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
18f59ea7d   Balbir Singh   memcg: memory cgr...
2944
  				(val == 1 || val == 0)) {
ea280e7b4   Tejun Heo   memcg: update mem...
2945
  		if (!memcg_has_children(memcg))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2946
  			memcg->use_hierarchy = val;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2947
2948
2949
2950
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
567fb435b   Glauber Costa   memcg: fix bad be...
2951
2952
  
  out:
0999821b1   Glauber Costa   memcg: replace cg...
2953
  	mutex_unlock(&memcg_create_mutex);
18f59ea7d   Balbir Singh   memcg: memory cgr...
2954
2955
2956
  
  	return retval;
  }
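  /*
   * Editor's note (illustrative, not part of the original source):
   * use_hierarchy is toggled from userspace with, for example,
   *
   *	echo 1 > memory.use_hierarchy
   *
   * As the checks above show, the write only succeeds when the value is 0 or
   * 1, the parent has use_hierarchy unset and this cgroup has no children
   * yet; otherwise the caller sees -EINVAL or -EBUSY respectively.
   */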
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2957
2958
  static unsigned long tree_stat(struct mem_cgroup *memcg,
  			       enum mem_cgroup_stat_index idx)
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
  {
  	struct mem_cgroup *iter;
  	long val = 0;
  
  	/* Per-cpu values can be negative, use a signed accumulator */
  	for_each_mem_cgroup_tree(iter, memcg)
  		val += mem_cgroup_read_stat(iter, idx);
  
  	if (val < 0) /* race ? */
  		val = 0;
  	return val;
  }
  
  static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
  {
  	u64 val;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2975
2976
2977
2978
2979
2980
  	if (mem_cgroup_is_root(memcg)) {
  		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
  		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
  		if (swap)
  			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
  	} else {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2981
  		if (!swap)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2982
  			val = page_counter_read(&memcg->memory);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2983
  		else
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2984
  			val = page_counter_read(&memcg->memsw);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2985
  	}
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2986
2987
  	return val << PAGE_SHIFT;
  }
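  /*
   * Editor's note (illustrative, not part of the original source): as the
   * root branch above shows, usage for the root cgroup is reconstructed from
   * the hierarchical CACHE, RSS and (optionally) SWAP statistics rather than
   * read from the page counter; any other cgroup simply reports its
   * page_counter value. Either way the result is a page count shifted into
   * bytes, which is what userspace sees in memory.usage_in_bytes.
   */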
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2988
2989
2990
2991
2992
2993
2994
  enum {
  	RES_USAGE,
  	RES_LIMIT,
  	RES_MAX_USAGE,
  	RES_FAILCNT,
  	RES_SOFT_LIMIT,
  };
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2995

791badbdb   Tejun Heo   memcg: convert aw...
2996
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
05b843012   Johannes Weiner   mm: memcontrol: u...
2997
  			       struct cftype *cft)
8cdea7c05   Balbir Singh   Memory controller...
2998
  {
182446d08   Tejun Heo   cgroup: pass arou...
2999
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3000
  	struct page_counter *counter;
af36f906c   Tejun Heo   memcg: always cre...
3001

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3002
  	switch (MEMFILE_TYPE(cft->private)) {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3003
  	case _MEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3004
3005
  		counter = &memcg->memory;
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3006
  	case _MEMSWAP:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3007
3008
  		counter = &memcg->memsw;
  		break;
510fc4e11   Glauber Costa   memcg: kmem accou...
3009
  	case _KMEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3010
  		counter = &memcg->kmem;
510fc4e11   Glauber Costa   memcg: kmem accou...
3011
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3012
3013
  	default:
  		BUG();
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3014
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
  
  	switch (MEMFILE_ATTR(cft->private)) {
  	case RES_USAGE:
  		if (counter == &memcg->memory)
  			return mem_cgroup_usage(memcg, false);
  		if (counter == &memcg->memsw)
  			return mem_cgroup_usage(memcg, true);
  		return (u64)page_counter_read(counter) * PAGE_SIZE;
  	case RES_LIMIT:
  		return (u64)counter->limit * PAGE_SIZE;
  	case RES_MAX_USAGE:
  		return (u64)counter->watermark * PAGE_SIZE;
  	case RES_FAILCNT:
  		return counter->failcnt;
  	case RES_SOFT_LIMIT:
  		return (u64)memcg->soft_limit * PAGE_SIZE;
  	default:
  		BUG();
  	}
8cdea7c05   Balbir Singh   Memory controller...
3034
  }
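  /*
   * Editor's note (illustrative, not part of the original source):
   * cft->private packs a counter type (_MEM, _MEMSWAP, _KMEM) together with
   * an attribute (RES_USAGE, RES_LIMIT, ...), which MEMFILE_TYPE() and
   * MEMFILE_ATTR() unpack above. Reading, say,
   *
   *	cat memory.limit_in_bytes
   *
   * therefore returns counter->limit scaled by PAGE_SIZE; an unlimited
   * counter shows up as PAGE_COUNTER_MAX pages expressed in bytes.
   */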
510fc4e11   Glauber Costa   memcg: kmem accou...
3035

510fc4e11   Glauber Costa   memcg: kmem accou...
3036
  #ifdef CONFIG_MEMCG_KMEM
8c0145b62   Vladimir Davydov   memcg: remove act...
3037
3038
  static int memcg_activate_kmem(struct mem_cgroup *memcg,
  			       unsigned long nr_pages)
d64416377   Vladimir Davydov   memcg: rework mem...
3039
3040
3041
  {
  	int err = 0;
  	int memcg_id;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3042
  	BUG_ON(memcg->kmemcg_id >= 0);
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3043
  	BUG_ON(memcg->kmem_acct_activated);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3044
  	BUG_ON(memcg->kmem_acct_active);
d64416377   Vladimir Davydov   memcg: rework mem...
3045
3046
  
  	/*
510fc4e11   Glauber Costa   memcg: kmem accou...
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
  	 * For simplicity, we won't allow this to be disabled.  It also can't
  	 * be changed if the cgroup has children already, or if tasks had
  	 * already joined.
  	 *
  	 * If tasks join before we set the limit, a person looking at
  	 * kmem.usage_in_bytes will have no way to determine when it took
  	 * place, which makes the value quite meaningless.
  	 *
  	 * After it first became limited, changes in the value of the limit are
  	 * of course permitted.
510fc4e11   Glauber Costa   memcg: kmem accou...
3057
  	 */
0999821b1   Glauber Costa   memcg: replace cg...
3058
  	mutex_lock(&memcg_create_mutex);
ea280e7b4   Tejun Heo   memcg: update mem...
3059
3060
  	if (cgroup_has_tasks(memcg->css.cgroup) ||
  	    (memcg->use_hierarchy && memcg_has_children(memcg)))
d64416377   Vladimir Davydov   memcg: rework mem...
3061
3062
3063
3064
  		err = -EBUSY;
  	mutex_unlock(&memcg_create_mutex);
  	if (err)
  		goto out;
510fc4e11   Glauber Costa   memcg: kmem accou...
3065

f3bb3043a   Vladimir Davydov   memcg: don't call...
3066
  	memcg_id = memcg_alloc_cache_id();
d64416377   Vladimir Davydov   memcg: rework mem...
3067
3068
3069
3070
  	if (memcg_id < 0) {
  		err = memcg_id;
  		goto out;
  	}
d64416377   Vladimir Davydov   memcg: rework mem...
3071
  	/*
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3072
3073
  	 * We couldn't have accounted to this cgroup, because it hasn't got
  	 * activated yet, so this should succeed.
d64416377   Vladimir Davydov   memcg: rework mem...
3074
  	 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3075
  	err = page_counter_limit(&memcg->kmem, nr_pages);
d64416377   Vladimir Davydov   memcg: rework mem...
3076
3077
3078
3079
  	VM_BUG_ON(err);
  
  	static_key_slow_inc(&memcg_kmem_enabled_key);
  	/*
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3080
3081
  	 * A memory cgroup is considered kmem-active as soon as it gets
  	 * kmemcg_id. Setting the id after enabling static branching will
d64416377   Vladimir Davydov   memcg: rework mem...
3082
3083
3084
  	 * guarantee no one starts accounting before all call sites are
  	 * patched.
  	 */
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3085
  	memcg->kmemcg_id = memcg_id;
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3086
  	memcg->kmem_acct_activated = true;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3087
  	memcg->kmem_acct_active = true;
510fc4e11   Glauber Costa   memcg: kmem accou...
3088
  out:
d64416377   Vladimir Davydov   memcg: rework mem...
3089
  	return err;
d64416377   Vladimir Davydov   memcg: rework mem...
3090
  }
d64416377   Vladimir Davydov   memcg: rework mem...
3091
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3092
  				   unsigned long limit)
d64416377   Vladimir Davydov   memcg: rework mem...
3093
3094
  {
  	int ret;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3095
  	mutex_lock(&memcg_limit_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
3096
  	if (!memcg_kmem_is_active(memcg))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3097
  		ret = memcg_activate_kmem(memcg, limit);
d64416377   Vladimir Davydov   memcg: rework mem...
3098
  	else
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3099
3100
  		ret = page_counter_limit(&memcg->kmem, limit);
  	mutex_unlock(&memcg_limit_mutex);
510fc4e11   Glauber Costa   memcg: kmem accou...
3101
3102
  	return ret;
  }
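  /*
   * Editor's note (illustrative, not part of the original source):
   * kernel-memory accounting is activated lazily by the first successful
   * write to the kmem limit, e.g.
   *
   *	echo 512M > memory.kmem.limit_in_bytes
   *
   * Later writes merely resize the existing page counter. Activation is
   * refused with -EBUSY once the cgroup has tasks or hierarchical children,
   * as enforced in memcg_activate_kmem() above.
   */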
55007d849   Glauber Costa   memcg: allocate m...
3103
  static int memcg_propagate_kmem(struct mem_cgroup *memcg)
510fc4e11   Glauber Costa   memcg: kmem accou...
3104
  {
55007d849   Glauber Costa   memcg: allocate m...
3105
  	int ret = 0;
510fc4e11   Glauber Costa   memcg: kmem accou...
3106
  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
55007d849   Glauber Costa   memcg: allocate m...
3107

d64416377   Vladimir Davydov   memcg: rework mem...
3108
3109
  	if (!parent)
  		return 0;
55007d849   Glauber Costa   memcg: allocate m...
3110

8c0145b62   Vladimir Davydov   memcg: remove act...
3111
  	mutex_lock(&memcg_limit_mutex);
55007d849   Glauber Costa   memcg: allocate m...
3112
  	/*
d64416377   Vladimir Davydov   memcg: rework mem...
3113
3114
  	 * If the parent cgroup is not kmem-active now, it cannot be activated
  	 * after this point, because it has at least one child already.
55007d849   Glauber Costa   memcg: allocate m...
3115
  	 */
d64416377   Vladimir Davydov   memcg: rework mem...
3116
  	if (memcg_kmem_is_active(parent))
8c0145b62   Vladimir Davydov   memcg: remove act...
3117
3118
  		ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
  	mutex_unlock(&memcg_limit_mutex);
55007d849   Glauber Costa   memcg: allocate m...
3119
  	return ret;
510fc4e11   Glauber Costa   memcg: kmem accou...
3120
  }
d64416377   Vladimir Davydov   memcg: rework mem...
3121
3122
  #else
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3123
  				   unsigned long limit)
d64416377   Vladimir Davydov   memcg: rework mem...
3124
3125
3126
  {
  	return -EINVAL;
  }
6d0439904   Hugh Dickins   memcg: stop warni...
3127
  #endif /* CONFIG_MEMCG_KMEM */
510fc4e11   Glauber Costa   memcg: kmem accou...
3128

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3129
3130
3131
3132
  /*
   * The user of this function is...
   * RES_LIMIT.
   */
451af504d   Tejun Heo   cgroup: replace c...
3133
3134
  static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
8cdea7c05   Balbir Singh   Memory controller...
3135
  {
451af504d   Tejun Heo   cgroup: replace c...
3136
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3137
  	unsigned long nr_pages;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3138
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
3139
  	buf = strstrip(buf);
650c5e565   Johannes Weiner   mm: page_counter:...
3140
  	ret = page_counter_memparse(buf, "-1", &nr_pages);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3141
3142
  	if (ret)
  		return ret;
af36f906c   Tejun Heo   memcg: always cre...
3143

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3144
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3145
  	case RES_LIMIT:
4b3bde4c9   Balbir Singh   memcg: remove the...
3146
3147
3148
3149
  		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
  			ret = -EINVAL;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3150
3151
3152
  		switch (MEMFILE_TYPE(of_cft(of)->private)) {
  		case _MEM:
  			ret = mem_cgroup_resize_limit(memcg, nr_pages);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3153
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3154
3155
  		case _MEMSWAP:
  			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
296c81d89   Balbir Singh   memory controller...
3156
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3157
3158
3159
3160
  		case _KMEM:
  			ret = memcg_update_kmem_limit(memcg, nr_pages);
  			break;
  		}
296c81d89   Balbir Singh   memory controller...
3161
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3162
3163
3164
  	case RES_SOFT_LIMIT:
  		memcg->soft_limit = nr_pages;
  		ret = 0;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3165
3166
  		break;
  	}
451af504d   Tejun Heo   cgroup: replace c...
3167
  	return ret ?: nbytes;
8cdea7c05   Balbir Singh   Memory controller...
3168
  }
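  /*
   * Editor's note (illustrative, not part of the original source):
   * page_counter_memparse() accepts the usual memparse() size suffixes and
   * treats the literal string "-1" as "no limit", so typical invocations
   * look like
   *
   *	echo 512M > memory.limit_in_bytes
   *	echo -1 > memory.memsw.limit_in_bytes
   *
   * The parsed byte value is truncated to whole pages before the respective
   * resize/limit helper is called.
   */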
6770c64e5   Tejun Heo   cgroup: replace c...
3169
3170
  static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
  				size_t nbytes, loff_t off)
c84872e16   Pavel Emelyanov   memcgroup: add th...
3171
  {
6770c64e5   Tejun Heo   cgroup: replace c...
3172
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3173
  	struct page_counter *counter;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3174

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
  	switch (MEMFILE_TYPE(of_cft(of)->private)) {
  	case _MEM:
  		counter = &memcg->memory;
  		break;
  	case _MEMSWAP:
  		counter = &memcg->memsw;
  		break;
  	case _KMEM:
  		counter = &memcg->kmem;
  		break;
  	default:
  		BUG();
  	}
af36f906c   Tejun Heo   memcg: always cre...
3188

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3189
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3190
  	case RES_MAX_USAGE:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3191
  		page_counter_reset_watermark(counter);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3192
3193
  		break;
  	case RES_FAILCNT:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3194
  		counter->failcnt = 0;
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3195
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3196
3197
  	default:
  		BUG();
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3198
  	}
f64c3f549   Balbir Singh   memory controller...
3199

6770c64e5   Tejun Heo   cgroup: replace c...
3200
  	return nbytes;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3201
  }
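  /*
   * Editor's note (illustrative, not part of the original source): these are
   * write-to-reset files and the written value is irrelevant, e.g.
   *
   *	echo 0 > memory.max_usage_in_bytes	(reset the high-watermark)
   *	echo 0 > memory.failcnt			(clear the failure counter)
   */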
182446d08   Tejun Heo   cgroup: pass arou...
3202
  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
3203
3204
  					struct cftype *cft)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3205
  	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
3206
  }
024914477   Daisuke Nishimura   memcg: move charg...
3207
  #ifdef CONFIG_MMU
182446d08   Tejun Heo   cgroup: pass arou...
3208
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
3209
3210
  					struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3211
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7dc74be03   Daisuke Nishimura   memcg: add interf...
3212

1dfab5abc   Johannes Weiner   mm: memcontrol: f...
3213
  	if (val & ~MOVE_MASK)
7dc74be03   Daisuke Nishimura   memcg: add interf...
3214
  		return -EINVAL;
ee5e8472b   Glauber Costa   memcg: prevent ch...
3215

7dc74be03   Daisuke Nishimura   memcg: add interf...
3216
  	/*
ee5e8472b   Glauber Costa   memcg: prevent ch...
3217
3218
3219
3220
  	 * No kind of locking is needed in here, because ->can_attach() will
  	 * check this value once in the beginning of the process, and then carry
  	 * on with stale data. This means that changes to this value will only
  	 * affect task migrations starting after the change.
7dc74be03   Daisuke Nishimura   memcg: add interf...
3221
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3222
  	memcg->move_charge_at_immigrate = val;
7dc74be03   Daisuke Nishimura   memcg: add interf...
3223
3224
  	return 0;
  }
024914477   Daisuke Nishimura   memcg: move charg...
3225
  #else
182446d08   Tejun Heo   cgroup: pass arou...
3226
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
024914477   Daisuke Nishimura   memcg: move charg...
3227
3228
3229
3230
3231
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
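  /*
   * Editor's note (illustrative, not part of the original source): the value
   * written to memory.move_charge_at_immigrate is a bitmask validated
   * against MOVE_MASK above; in this version bit 0 selects anonymous pages
   * and bit 1 selects file pages, so "echo 3" enables both. On !CONFIG_MMU
   * kernels the write always fails with -ENOSYS.
   */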
7dc74be03   Daisuke Nishimura   memcg: add interf...
3232

406eb0c9b   Ying Han   memcg: add memory...
3233
  #ifdef CONFIG_NUMA
2da8ca822   Tejun Heo   cgroup: replace c...
3234
  static int memcg_numa_stat_show(struct seq_file *m, void *v)
406eb0c9b   Ying Han   memcg: add memory...
3235
  {
25485de6e   Greg Thelen   memcg: refactor m...
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
  	struct numa_stat {
  		const char *name;
  		unsigned int lru_mask;
  	};
  
  	static const struct numa_stat stats[] = {
  		{ "total", LRU_ALL },
  		{ "file", LRU_ALL_FILE },
  		{ "anon", LRU_ALL_ANON },
  		{ "unevictable", BIT(LRU_UNEVICTABLE) },
  	};
  	const struct numa_stat *stat;
406eb0c9b   Ying Han   memcg: add memory...
3248
  	int nid;
25485de6e   Greg Thelen   memcg: refactor m...
3249
  	unsigned long nr;
2da8ca822   Tejun Heo   cgroup: replace c...
3250
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
406eb0c9b   Ying Han   memcg: add memory...
3251

25485de6e   Greg Thelen   memcg: refactor m...
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
  		seq_printf(m, "%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
  							  stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
3262
  	}
406eb0c9b   Ying Han   memcg: add memory...
3263

071aee138   Ying Han   memcg: support hi...
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		struct mem_cgroup *iter;
  
  		nr = 0;
  		for_each_mem_cgroup_tree(iter, memcg)
  			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
  		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = 0;
  			for_each_mem_cgroup_tree(iter, memcg)
  				nr += mem_cgroup_node_nr_lru_pages(
  					iter, nid, stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
  		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
3280
  	}
406eb0c9b   Ying Han   memcg: add memory...
3281

406eb0c9b   Ying Han   memcg: add memory...
3282
3283
3284
  	return 0;
  }
  #endif /* CONFIG_NUMA */
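  /*
   * Editor's note (illustrative, not part of the original source): the
   * memory.numa_stat output produced above looks roughly like
   *
   *	total=<pages> N0=<pages> N1=<pages> ...
   *	file=<pages> N0=... N1=...
   *	anon=<pages> N0=... N1=...
   *	unevictable=<pages> N0=... N1=...
   *	hierarchical_total=<pages> N0=... N1=...
   *	(and so on for hierarchical_file/anon/unevictable)
   *
   * Every figure is a page count, the N<id> fields break the total down per
   * NUMA node, and the hierarchical_* lines include all descendant cgroups.
   */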
2da8ca822   Tejun Heo   cgroup: replace c...
3285
  static int memcg_stat_show(struct seq_file *m, void *v)
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3286
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3287
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3288
  	unsigned long memory, memsw;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3289
3290
  	struct mem_cgroup *mi;
  	unsigned int i;
406eb0c9b   Ying Han   memcg: add memory...
3291

0ca44b148   Greg Thelen   memcg: add BUILD_...
3292
3293
3294
3295
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
  		     MEM_CGROUP_STAT_NSTATS);
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
  		     MEM_CGROUP_EVENTS_NSTATS);
70bc068c4   Rickard Strandqvist   mm/memcontrol.c: ...
3296
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3297
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
3298
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3299
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3300
3301
3302
  		seq_printf(m, "%s %ld
  ", mem_cgroup_stat_names[i],
  			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3303
  	}
7b854121e   Lee Schermerhorn   Unevictable LRU P...
3304

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3305
3306
3307
3308
3309
3310
3311
3312
3313
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_events_names[i],
  			   mem_cgroup_read_events(memcg, i));
  
  	for (i = 0; i < NR_LRU_LISTS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_lru_names[i],
  			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3314
  	/* Hierarchical information */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3315
3316
3317
3318
  	memory = memsw = PAGE_COUNTER_MAX;
  	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
  		memory = min(memory, mi->memory.limit);
  		memsw = min(memsw, mi->memsw.limit);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
3319
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3320
3321
3322
3323
3324
3325
3326
  	seq_printf(m, "hierarchical_memory_limit %llu
  ",
  		   (u64)memory * PAGE_SIZE);
  	if (do_swap_account)
  		seq_printf(m, "hierarchical_memsw_limit %llu
  ",
  			   (u64)memsw * PAGE_SIZE);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3327

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3328
3329
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  		long long val = 0;
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
3330
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3331
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
  		seq_printf(m, "total_%s %lld
  ", mem_cgroup_stat_names[i], val);
  	}
  
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_events(mi, i);
  		seq_printf(m, "total_%s %llu
  ",
  			   mem_cgroup_events_names[i], val);
  	}
  
  	for (i = 0; i < NR_LRU_LISTS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
  		seq_printf(m, "total_%s %llu
  ", mem_cgroup_lru_names[i], val);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3355
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3356

7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3357
  #ifdef CONFIG_DEBUG_VM
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3358
3359
3360
  	{
  		int nid, zid;
  		struct mem_cgroup_per_zone *mz;
89abfab13   Hugh Dickins   mm/memcg: move re...
3361
  		struct zone_reclaim_stat *rstat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3362
3363
3364
3365
3366
  		unsigned long recent_rotated[2] = {0, 0};
  		unsigned long recent_scanned[2] = {0, 0};
  
  		for_each_online_node(nid)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
3367
  				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
89abfab13   Hugh Dickins   mm/memcg: move re...
3368
  				rstat = &mz->lruvec.reclaim_stat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3369

89abfab13   Hugh Dickins   mm/memcg: move re...
3370
3371
3372
3373
  				recent_rotated[0] += rstat->recent_rotated[0];
  				recent_rotated[1] += rstat->recent_rotated[1];
  				recent_scanned[0] += rstat->recent_scanned[0];
  				recent_scanned[1] += rstat->recent_scanned[1];
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3374
  			}
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
3375
3376
3377
3378
3379
3380
3381
3382
  		seq_printf(m, "recent_rotated_anon %lu
  ", recent_rotated[0]);
  		seq_printf(m, "recent_rotated_file %lu
  ", recent_rotated[1]);
  		seq_printf(m, "recent_scanned_anon %lu
  ", recent_scanned[0]);
  		seq_printf(m, "recent_scanned_file %lu
  ", recent_scanned[1]);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3383
3384
  	}
  #endif
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3385
3386
  	return 0;
  }
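  /*
   * Editor's note (illustrative, not part of the original source): this is
   * the backend of memory.stat. The per-memcg statistics and event counters
   * come first, followed by the per-LRU sizes in bytes, then
   * hierarchical_memory_limit/hierarchical_memsw_limit and the total_*
   * variants summed over the whole subtree; the recent_rotated_*/
   * recent_scanned_* reclaim-debug block at the end only appears on
   * CONFIG_DEBUG_VM kernels.
   */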
182446d08   Tejun Heo   cgroup: pass arou...
3387
3388
  static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
  				      struct cftype *cft)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3389
  {
182446d08   Tejun Heo   cgroup: pass arou...
3390
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3391

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
3392
  	return mem_cgroup_swappiness(memcg);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3393
  }
182446d08   Tejun Heo   cgroup: pass arou...
3394
3395
  static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
  				       struct cftype *cft, u64 val)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3396
  {
182446d08   Tejun Heo   cgroup: pass arou...
3397
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3398

3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3399
  	if (val > 100)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3400
  		return -EINVAL;
14208b0ec   Linus Torvalds   Merge branch 'for...
3401
  	if (css->parent)
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3402
3403
3404
  		memcg->swappiness = val;
  	else
  		vm_swappiness = val;
068b38c1f   Li Zefan   memcg: fix a race...
3405

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3406
3407
  	return 0;
  }
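  /*
   * Editor's note (illustrative, not part of the original source): valid
   * values are 0-100. Writing to a non-root cgroup, e.g.
   *
   *	echo 10 > memory.swappiness
   *
   * only affects reclaim within that cgroup, while a write to the root
   * cgroup's file falls through to the global vm_swappiness knob.
   */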
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3408
3409
3410
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
  	struct mem_cgroup_threshold_ary *t;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3411
  	unsigned long usage;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3412
3413
3414
3415
  	int i;
  
  	rcu_read_lock();
  	if (!swap)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3416
  		t = rcu_dereference(memcg->thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3417
  	else
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3418
  		t = rcu_dereference(memcg->memsw_thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3419
3420
3421
  
  	if (!t)
  		goto unlock;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3422
  	usage = mem_cgroup_usage(memcg, swap);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3423
3424
  
  	/*
748dad36d   Sha Zhengju   memcg: make thres...
3425
  	 * current_threshold points to threshold just below or equal to usage.
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3426
3427
3428
  	 * If it's not true, a threshold was crossed after last
  	 * call of __mem_cgroup_threshold().
  	 */
5407a5625   Phil Carmody   mm: remove unnece...
3429
  	i = t->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
  
  	/*
  	 * Iterate backward over array of thresholds starting from
  	 * current_threshold and check if a threshold is crossed.
  	 * If none of thresholds below usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* i = current_threshold + 1 */
  	i++;
  
  	/*
  	 * Iterate forward over array of thresholds starting from
  	 * current_threshold+1 and check if a threshold is crossed.
  	 * If none of thresholds above usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* Update current_threshold */
5407a5625   Phil Carmody   mm: remove unnece...
3453
  	t->current_threshold = i - 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3454
3455
3456
3457
3458
3459
  unlock:
  	rcu_read_unlock();
  }
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
ad4ca5f4b   Kirill A. Shutemov   memcg: fix thresh...
3460
3461
3462
3463
3464
3465
3466
  	while (memcg) {
  		__mem_cgroup_threshold(memcg, false);
  		if (do_swap_account)
  			__mem_cgroup_threshold(memcg, true);
  
  		memcg = parent_mem_cgroup(memcg);
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3467
3468
3469
3470
3471
3472
  }
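  /*
   * Editor's note (worked example, not part of the original source): assume
   * a cgroup has usage thresholds registered at 4M, 8M and 16M and that
   * usage last settled between 4M and 8M, so current_threshold points at the
   * 4M entry. If usage then jumps to 18M, the backward scan in
   * __mem_cgroup_threshold() fires nothing, the forward scan signals the
   * eventfds registered at 8M and 16M, and current_threshold ends up on the
   * 16M entry. The walk up the parents above repeats this for every ancestor
   * that has thresholds of its own.
   */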
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
2bff24a37   Greg Thelen   memcg: fix multip...
3473
3474
3475
3476
3477
3478
3479
  	if (_a->threshold > _b->threshold)
  		return 1;
  
  	if (_a->threshold < _b->threshold)
  		return -1;
  
  	return 0;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3480
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3481
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3482
3483
  {
  	struct mem_cgroup_eventfd_list *ev;
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3484
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
3485
  	list_for_each_entry(ev, &memcg->oom_notify, list)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3486
  		eventfd_signal(ev->eventfd, 1);
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3487
3488
  
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3489
3490
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3491
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3492
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3493
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3494
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3495
  		mem_cgroup_oom_notify_cb(iter);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3496
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3497
  static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3498
  	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3499
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3500
3501
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3502
3503
  	unsigned long threshold;
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3504
  	int i, size, ret;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3505

650c5e565   Johannes Weiner   mm: page_counter:...
3506
  	ret = page_counter_memparse(args, "-1", &threshold);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3507
3508
  	if (ret)
  		return ret;
2105f9aec   Shaohua Li   memcg: convert th...
3509
  	threshold <<= PAGE_SHIFT;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3510
3511
  
  	mutex_lock(&memcg->thresholds_lock);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3512

05b843012   Johannes Weiner   mm: memcontrol: u...
3513
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3514
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3515
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3516
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3517
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3518
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3519
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3520
  		BUG();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3521
  	/* Check if a threshold crossed before adding a new one */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3522
  	if (thresholds->primary)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3523
  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3524
  	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3525
3526
  
  	/* Allocate memory for new array of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3527
  	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3528
  			GFP_KERNEL);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3529
  	if (!new) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3530
3531
3532
  		ret = -ENOMEM;
  		goto unlock;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3533
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3534
3535
  
  	/* Copy thresholds (if any) to new array */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3536
3537
  	if (thresholds->primary) {
  		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3538
  				sizeof(struct mem_cgroup_threshold));
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3539
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3540
  	/* Add new threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3541
3542
  	new->entries[size - 1].eventfd = eventfd;
  	new->entries[size - 1].threshold = threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3543
3544
  
  	/* Sort thresholds. Registering of new threshold isn't time-critical */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3545
  	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3546
3547
3548
  			compare_thresholds, NULL);
  
  	/* Find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3549
  	new->current_threshold = -1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3550
  	for (i = 0; i < size; i++) {
748dad36d   Sha Zhengju   memcg: make thres...
3551
  		if (new->entries[i].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3552
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3553
3554
  			 * new->current_threshold will not be used until
  			 * rcu_assign_pointer(), so it's safe to increment
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3555
3556
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3557
  			++new->current_threshold;
748dad36d   Sha Zhengju   memcg: make thres...
3558
3559
  		} else
  			break;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3560
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3561
3562
3563
3564
3565
  	/* Free old spare buffer and save old primary buffer as spare */
  	kfree(thresholds->spare);
  	thresholds->spare = thresholds->primary;
  
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3566

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3567
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3568
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3569
3570
3571
3572
3573
  unlock:
  	mutex_unlock(&memcg->thresholds_lock);
  
  	return ret;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3574
  static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3575
3576
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3577
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3578
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3579
  static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3580
3581
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3582
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3583
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3584
  static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3585
  	struct eventfd_ctx *eventfd, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3586
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3587
3588
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3589
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3590
  	int i, j, size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3591
3592
  
  	mutex_lock(&memcg->thresholds_lock);
05b843012   Johannes Weiner   mm: memcontrol: u...
3593
3594
  
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3595
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3596
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3597
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3598
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3599
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3600
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3601
  		BUG();
371528cae   Anton Vorontsov   mm: memcg: Correc...
3602
3603
  	if (!thresholds->primary)
  		goto unlock;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3604
3605
3606
3607
  	/* Check if a threshold crossed before removing */
  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
  	/* Calculate new number of threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3608
3609
3610
  	size = 0;
  	for (i = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd != eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3611
3612
  			size++;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3613
  	new = thresholds->spare;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3614

2e72b6347   Kirill A. Shutemov   memcg: implement ...
3615
3616
  	/* Set thresholds array to NULL if we don't have thresholds */
  	if (!size) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3617
3618
  		kfree(new);
  		new = NULL;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3619
  		goto swap_buffers;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3620
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3621
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3622
3623
  
  	/* Copy thresholds and find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3624
3625
3626
  	new->current_threshold = -1;
  	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd == eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3627
  			continue;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3628
  		new->entries[j] = thresholds->primary->entries[i];
748dad36d   Sha Zhengju   memcg: make thres...
3629
  		if (new->entries[j].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3630
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3631
  			 * new->current_threshold will not be used
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3632
3633
3634
  			 * until rcu_assign_pointer(), so it's safe to increment
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3635
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3636
3637
3638
  		}
  		j++;
  	}
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3639
  swap_buffers:
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3640
3641
  	/* Swap primary and spare array */
  	thresholds->spare = thresholds->primary;
8c7577637   Sha Zhengju   memcg: free spare...
3642
3643
3644
3645
3646
  	/* If all events are unregistered, free the spare array */
  	if (!new) {
  		kfree(thresholds->spare);
  		thresholds->spare = NULL;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3647
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3648

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3649
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3650
  	synchronize_rcu();
371528cae   Anton Vorontsov   mm: memcg: Correc...
3651
  unlock:
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3652
  	mutex_unlock(&memcg->thresholds_lock);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3653
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3654

59b6f8734   Tejun Heo   memcg: make cgrou...
3655
  static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3656
3657
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3658
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3659
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3660
  static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3661
3662
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3663
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3664
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3665
  static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3666
  	struct eventfd_ctx *eventfd, const char *args)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3667
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3668
  	struct mem_cgroup_eventfd_list *event;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3669

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3670
3671
3672
  	event = kmalloc(sizeof(*event),	GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
1af8efe96   Michal Hocko   memcg: change mem...
3673
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3674
3675
3676
3677
3678
  
  	event->eventfd = eventfd;
  	list_add(&event->list, &memcg->oom_notify);
  
  	/* already in OOM ? */
79dfdaccd   Michal Hocko   memcg: make oom_l...
3679
  	if (atomic_read(&memcg->under_oom))
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3680
  		eventfd_signal(eventfd, 1);
1af8efe96   Michal Hocko   memcg: change mem...
3681
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3682
3683
3684
  
  	return 0;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3685
  static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3686
  	struct eventfd_ctx *eventfd)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3687
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3688
  	struct mem_cgroup_eventfd_list *ev, *tmp;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3689

1af8efe96   Michal Hocko   memcg: change mem...
3690
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3691

c0ff4b854   Raghavendra K T   memcg: rename mem...
3692
  	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3693
3694
3695
3696
3697
  		if (ev->eventfd == eventfd) {
  			list_del(&ev->list);
  			kfree(ev);
  		}
  	}
1af8efe96   Michal Hocko   memcg: change mem...
3698
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3699
  }
2da8ca822   Tejun Heo   cgroup: replace c...
3700
  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3701
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3702
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3703

791badbdb   Tejun Heo   memcg: convert aw...
3704
3705
3706
3707
  	seq_printf(sf, "oom_kill_disable %d
  ", memcg->oom_kill_disable);
  	seq_printf(sf, "under_oom %d
  ", (bool)atomic_read(&memcg->under_oom));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3708
3709
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
3710
  static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3711
3712
  	struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3713
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3714
3715
  
  	/* cannot set to root cgroup and only 0 and 1 are allowed */
14208b0ec   Linus Torvalds   Merge branch 'for...
3716
  	if (!css->parent || !((val == 0) || (val == 1)))
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3717
  		return -EINVAL;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3718
  	memcg->oom_kill_disable = val;
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
3719
  	if (!val)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3720
  		memcg_oom_recover(memcg);
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3721

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3722
3723
  	return 0;
  }
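  /*
   * Editor's note (illustrative, not part of the original source): writing
   * to memory.oom_control toggles the per-cgroup OOM killer:
   *
   *	echo 1 > memory.oom_control	(disable OOM kills; tasks wait instead)
   *	echo 0 > memory.oom_control	(re-enable and wake any waiters)
   *
   * The root cgroup rejects the write, and reading the file reports both
   * oom_kill_disable and the current under_oom state.
   */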
c255a4580   Andrew Morton   memcg: rename con...
3724
  #ifdef CONFIG_MEMCG_KMEM
cbe128e34   Glauber Costa   cgroup: get rid o...
3725
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
3726
  {
55007d849   Glauber Costa   memcg: allocate m...
3727
  	int ret;
55007d849   Glauber Costa   memcg: allocate m...
3728
3729
3730
  	ret = memcg_propagate_kmem(memcg);
  	if (ret)
  		return ret;
2633d7a02   Glauber Costa   slab/slub: consid...
3731

1d62e4365   Glauber Costa   cgroup: pass stru...
3732
  	return mem_cgroup_sockets_init(memcg, ss);
573b400d0   Michel Lespinasse   mm/memcontrol.c: ...
3733
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3734

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3735
3736
  static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
  {
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3737
3738
3739
  	struct cgroup_subsys_state *css;
  	struct mem_cgroup *parent, *child;
  	int kmemcg_id;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
  	if (!memcg->kmem_acct_active)
  		return;
  
  	/*
  	 * Clear the 'active' flag before clearing memcg_caches arrays entries.
  	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
  	 * guarantees no cache will be created for this cgroup after we are
  	 * done (see memcg_create_kmem_cache()).
  	 */
  	memcg->kmem_acct_active = false;
  
  	memcg_deactivate_kmem_caches(memcg);
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
  
  	kmemcg_id = memcg->kmemcg_id;
  	BUG_ON(kmemcg_id < 0);
  
  	parent = parent_mem_cgroup(memcg);
  	if (!parent)
  		parent = root_mem_cgroup;
  
  	/*
  	 * Change kmemcg_id of this cgroup and all its descendants to the
  	 * parent's id, and then move all entries from this cgroup's list_lrus
  	 * to ones of the parent. After we have finished, all list_lrus
  	 * corresponding to this cgroup are guaranteed to remain empty. The
  	 * ordering is imposed by list_lru_node->lock taken by
  	 * memcg_drain_all_list_lrus().
  	 */
  	css_for_each_descendant_pre(css, &memcg->css) {
  		child = mem_cgroup_from_css(css);
  		BUG_ON(child->kmemcg_id != kmemcg_id);
  		child->kmemcg_id = parent->kmemcg_id;
  		if (!memcg->use_hierarchy)
  			break;
  	}
  	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
  
  	memcg_free_cache_id(kmemcg_id);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3778
  }
10d5ebf40   Li Zefan   memcg: use css_ge...
3779
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
d1a4c0b37   Glauber Costa   tcp memory pressu...
3780
  {
f48b80a5e   Vladimir Davydov   memcg: cleanup st...
3781
3782
3783
3784
3785
  	if (memcg->kmem_acct_activated) {
  		memcg_destroy_kmem_caches(memcg);
  		static_key_slow_dec(&memcg_kmem_enabled_key);
  		WARN_ON(page_counter_read(&memcg->kmem));
  	}
1d62e4365   Glauber Costa   cgroup: pass stru...
3786
  	mem_cgroup_sockets_destroy(memcg);
10d5ebf40   Li Zefan   memcg: use css_ge...
3787
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3788
  #else
cbe128e34   Glauber Costa   cgroup: get rid o...
3789
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
3790
3791
3792
  {
  	return 0;
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
3793

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3794
3795
3796
  static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
  {
  }
10d5ebf40   Li Zefan   memcg: use css_ge...
3797
3798
3799
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
  {
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3800
  #endif
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3801
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
   * DO NOT USE IN NEW FILES.
   *
   * "cgroup.event_control" implementation.
   *
   * This is way over-engineered.  It tries to support fully configurable
   * events for each user.  Such level of flexibility is completely
   * unnecessary especially in the light of the planned unified hierarchy.
   *
   * Please deprecate this and replace with something simpler if at all
   * possible.
   */
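  /*
   * Editor's note (illustrative sketch, not part of the original source):
   * a legacy registration from userspace looks roughly like
   *
   *	efd = eventfd(0, 0);
   *	cfd = open(".../memory.usage_in_bytes", O_RDONLY);
   *	dprintf(ecfd, "%d %d 100M", efd, cfd);   (ecfd: fd of cgroup.event_control)
   *
   * after which the eventfd becomes readable whenever usage crosses the
   * 100M threshold. The "<event_fd> <control_fd> <args>" format is parsed by
   * memcg_write_event_control() below.
   */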
  
  /*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3815
3816
3817
3818
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3819
  static void memcg_event_remove(struct work_struct *work)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3820
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3821
3822
  	struct mem_cgroup_event *event =
  		container_of(work, struct mem_cgroup_event, remove);
59b6f8734   Tejun Heo   memcg: make cgrou...
3823
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3824
3825
  
  	remove_wait_queue(event->wqh, &event->wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3826
  	event->unregister_event(memcg, event->eventfd);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3827
3828
3829
3830
3831
3832
  
  	/* Notify userspace the event is going away. */
  	eventfd_signal(event->eventfd, 1);
  
  	eventfd_ctx_put(event->eventfd);
  	kfree(event);
59b6f8734   Tejun Heo   memcg: make cgrou...
3833
  	css_put(&memcg->css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3834
3835
3836
3837
3838
3839
3840
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3841
3842
  static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
  			    int sync, void *key)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3843
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3844
3845
  	struct mem_cgroup_event *event =
  		container_of(wait, struct mem_cgroup_event, wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3846
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
  	unsigned long flags = (unsigned long)key;
  
  	if (flags & POLLHUP) {
  		/*
  		 * If the event has been detached at cgroup removal, we
  		 * can simply return knowing the other side will cleanup
  		 * for us.
  		 *
  		 * We can't race against event freeing since the other
  		 * side will require wqh->lock via remove_wait_queue(),
  		 * which we hold.
  		 */
fba948078   Tejun Heo   cgroup, memcg: mo...
3859
  		spin_lock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3860
3861
3862
3863
3864
3865
3866
3867
  		if (!list_empty(&event->list)) {
  			list_del_init(&event->list);
  			/*
  			 * We are in atomic context, but memcg_event_remove()
  			 * may sleep, so we have to call it in workqueue.
  			 */
  			schedule_work(&event->remove);
  		}
fba948078   Tejun Heo   cgroup, memcg: mo...
3868
  		spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3869
3870
3871
3872
  	}
  
  	return 0;
  }
3bc942f37   Tejun Heo   memcg: rename cgr...
3873
  static void memcg_event_ptable_queue_proc(struct file *file,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3874
3875
  		wait_queue_head_t *wqh, poll_table *pt)
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3876
3877
  	struct mem_cgroup_event *event =
  		container_of(pt, struct mem_cgroup_event, pt);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3878
3879
3880
3881
3882
3883
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3884
3885
   * DO NOT USE IN NEW FILES.
   *
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3886
3887
3888
3889
3890
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
451af504d   Tejun Heo   cgroup: replace c...
3891
3892
  static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
  					 char *buf, size_t nbytes, loff_t off)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3893
  {
451af504d   Tejun Heo   cgroup: replace c...
3894
  	struct cgroup_subsys_state *css = of_css(of);
fba948078   Tejun Heo   cgroup, memcg: mo...
3895
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
3896
  	struct mem_cgroup_event *event;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3897
3898
3899
3900
  	struct cgroup_subsys_state *cfile_css;
  	unsigned int efd, cfd;
  	struct fd efile;
  	struct fd cfile;
fba948078   Tejun Heo   cgroup, memcg: mo...
3901
  	const char *name;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3902
3903
  	char *endp;
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
3904
3905
3906
  	buf = strstrip(buf);
  
  	efd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3907
3908
  	if (*endp != ' ')
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
3909
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3910

451af504d   Tejun Heo   cgroup: replace c...
3911
  	cfd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3912
3913
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
3914
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3915
3916
3917
3918
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
59b6f8734   Tejun Heo   memcg: make cgrou...
3919
  	event->memcg = memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3920
  	INIT_LIST_HEAD(&event->list);
3bc942f37   Tejun Heo   memcg: rename cgr...
3921
3922
3923
  	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
  	INIT_WORK(&event->remove, memcg_event_remove);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
  
  	efile = fdget(efd);
  	if (!efile.file) {
  		ret = -EBADF;
  		goto out_kfree;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile.file);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto out_put_efile;
  	}
  
  	cfile = fdget(cfd);
  	if (!cfile.file) {
  		ret = -EBADF;
  		goto out_put_eventfd;
  	}
  
  	/* the process needs read permission on the control file */
  	/* AV: shouldn't we check that it's been opened for read instead? */
  	ret = inode_permission(file_inode(cfile.file), MAY_READ);
  	if (ret < 0)
  		goto out_put_cfile;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3948
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
3949
3950
3951
3952
  	 * Determine the event callbacks and set them in @event.  This used
  	 * to be done via struct cftype but cgroup core no longer knows
  	 * about these events.  The following is crude but the whole thing
  	 * is for compatibility anyway.
3bc942f37   Tejun Heo   memcg: rename cgr...
3953
3954
  	 *
  	 * DO NOT ADD NEW FILES.
fba948078   Tejun Heo   cgroup, memcg: mo...
3955
  	 */
b583043e9   Al Viro   kill f_dentry uses
3956
  	name = cfile.file->f_path.dentry->d_name.name;
fba948078   Tejun Heo   cgroup, memcg: mo...
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
  
  	if (!strcmp(name, "memory.usage_in_bytes")) {
  		event->register_event = mem_cgroup_usage_register_event;
  		event->unregister_event = mem_cgroup_usage_unregister_event;
  	} else if (!strcmp(name, "memory.oom_control")) {
  		event->register_event = mem_cgroup_oom_register_event;
  		event->unregister_event = mem_cgroup_oom_unregister_event;
  	} else if (!strcmp(name, "memory.pressure_level")) {
  		event->register_event = vmpressure_register_event;
  		event->unregister_event = vmpressure_unregister_event;
  	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
347c4a874   Tejun Heo   memcg: remove cgr...
3968
3969
  		event->register_event = memsw_cgroup_usage_register_event;
  		event->unregister_event = memsw_cgroup_usage_unregister_event;
fba948078   Tejun Heo   cgroup, memcg: mo...
3970
3971
3972
3973
3974
3975
  	} else {
  		ret = -EINVAL;
  		goto out_put_cfile;
  	}
  
  	/*
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
3976
3977
3978
  	 * Verify that @cfile belongs to @css.  Also, remaining events are
  	 * automatically removed on cgroup destruction but the removal is
  	 * asynchronous, so take an extra ref on @css.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3979
  	 */
b583043e9   Al Viro   kill f_dentry uses
3980
  	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
ec903c0c8   Tejun Heo   cgroup: rename cs...
3981
  					       &memory_cgrp_subsys);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3982
  	ret = -EINVAL;
5a17f543e   Tejun Heo   cgroup: improve c...
3983
  	if (IS_ERR(cfile_css))
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3984
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
3985
3986
  	if (cfile_css != css) {
  		css_put(cfile_css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3987
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
3988
  	}
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3989

451af504d   Tejun Heo   cgroup: replace c...
3990
  	ret = event->register_event(memcg, event->eventfd, buf);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3991
3992
3993
3994
  	if (ret)
  		goto out_put_css;
  
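  	/*
  	 * Polling the eventfd once makes it call
  	 * memcg_event_ptable_queue_proc() above, which hooks event->wait
  	 * into the eventfd's waitqueue so we are notified (POLLHUP) when
  	 * the eventfd side goes away.
  	 */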
  	efile.file->f_op->poll(efile.file, &event->pt);
fba948078   Tejun Heo   cgroup, memcg: mo...
3995
3996
3997
  	spin_lock(&memcg->event_list_lock);
  	list_add(&event->list, &memcg->event_list);
  	spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3998
3999
4000
  
  	fdput(cfile);
  	fdput(efile);
451af504d   Tejun Heo   cgroup: replace c...
4001
  	return nbytes;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4002
4003
  
  out_put_css:
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
4004
  	css_put(css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
  out_put_cfile:
  	fdput(cfile);
  out_put_eventfd:
  	eventfd_ctx_put(event->eventfd);
  out_put_efile:
  	fdput(efile);
  out_kfree:
  	kfree(event);
  
  	return ret;
  }
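  /*
   * A minimal userspace sketch (not part of memcontrol.c) of the
   * '<event_fd> <control_fd> <args>' write described above: arming a
   * usage-threshold notification on the legacy hierarchy.  The cgroup
   * path "/sys/fs/cgroup/memory/mygroup" and the 100 MB threshold are
   * illustrative assumptions, not values taken from this file.
   */
  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/eventfd.h>
  #include <unistd.h>
  
  int main(void)
  {
  	char buf[64];
  	uint64_t count;
  	int efd = eventfd(0, 0);
  	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes",
  		       O_RDONLY);
  	int ecfd = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control",
  			O_WRONLY);
  
  	if (efd < 0 || cfd < 0 || ecfd < 0)
  		return 1;
  
  	/* "<event_fd> <control_fd> <args>": here args is a byte threshold */
  	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd,
  		 (unsigned long long)(100ULL << 20));
  	if (write(ecfd, buf, strlen(buf)) < 0)
  		return 1;
  
  	/* blocks until memory.usage_in_bytes crosses the threshold */
  	if (read(efd, &count, sizeof(count)) == sizeof(count))
  		printf("threshold crossed %llu time(s)\n",
  		       (unsigned long long)count);
  	return 0;
  }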
241994ed8   Johannes Weiner   mm: memcontrol: d...
4016
  static struct cftype mem_cgroup_legacy_files[] = {
8cdea7c05   Balbir Singh   Memory controller...
4017
  	{
0eea10301   Balbir Singh   Memory controller...
4018
  		.name = "usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4019
  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4020
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4021
4022
  	},
  	{
c84872e16   Pavel Emelyanov   memcgroup: add th...
4023
  		.name = "max_usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4024
  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4025
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4026
  		.read_u64 = mem_cgroup_read_u64,
c84872e16   Pavel Emelyanov   memcgroup: add th...
4027
4028
  	},
  	{
0eea10301   Balbir Singh   Memory controller...
4029
  		.name = "limit_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4030
  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4031
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4032
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4033
4034
  	},
  	{
296c81d89   Balbir Singh   memory controller...
4035
4036
  		.name = "soft_limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4037
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4038
  		.read_u64 = mem_cgroup_read_u64,
296c81d89   Balbir Singh   memory controller...
4039
4040
  	},
  	{
8cdea7c05   Balbir Singh   Memory controller...
4041
  		.name = "failcnt",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4042
  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4043
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4044
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4045
  	},
8697d3319   Balbir Singh   Memory controller...
4046
  	{
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4047
  		.name = "stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4048
  		.seq_show = memcg_stat_show,
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4049
  	},
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4050
4051
  	{
  		.name = "force_empty",
6770c64e5   Tejun Heo   cgroup: replace c...
4052
  		.write = mem_cgroup_force_empty_write,
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4053
  	},
18f59ea7d   Balbir Singh   memcg: memory cgr...
4054
4055
4056
4057
4058
  	{
  		.name = "use_hierarchy",
  		.write_u64 = mem_cgroup_hierarchy_write,
  		.read_u64 = mem_cgroup_hierarchy_read,
  	},
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4059
  	{
3bc942f37   Tejun Heo   memcg: rename cgr...
4060
  		.name = "cgroup.event_control",		/* XXX: for compat */
451af504d   Tejun Heo   cgroup: replace c...
4061
  		.write = memcg_write_event_control,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4062
4063
4064
4065
  		.flags = CFTYPE_NO_PREFIX,
  		.mode = S_IWUGO,
  	},
  	{
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4066
4067
4068
4069
  		.name = "swappiness",
  		.read_u64 = mem_cgroup_swappiness_read,
  		.write_u64 = mem_cgroup_swappiness_write,
  	},
7dc74be03   Daisuke Nishimura   memcg: add interf...
4070
4071
4072
4073
4074
  	{
  		.name = "move_charge_at_immigrate",
  		.read_u64 = mem_cgroup_move_charge_read,
  		.write_u64 = mem_cgroup_move_charge_write,
  	},
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4075
4076
  	{
  		.name = "oom_control",
2da8ca822   Tejun Heo   cgroup: replace c...
4077
  		.seq_show = mem_cgroup_oom_control_read,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4078
  		.write_u64 = mem_cgroup_oom_control_write,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4079
4080
  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  	},
70ddf637e   Anton Vorontsov   memcg: add memory...
4081
4082
  	{
  		.name = "pressure_level",
70ddf637e   Anton Vorontsov   memcg: add memory...
4083
  	},
406eb0c9b   Ying Han   memcg: add memory...
4084
4085
4086
  #ifdef CONFIG_NUMA
  	{
  		.name = "numa_stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4087
  		.seq_show = memcg_numa_stat_show,
406eb0c9b   Ying Han   memcg: add memory...
4088
4089
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4090
4091
4092
4093
  #ifdef CONFIG_MEMCG_KMEM
  	{
  		.name = "kmem.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4094
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4095
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4096
4097
4098
4099
  	},
  	{
  		.name = "kmem.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4100
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4101
4102
4103
4104
  	},
  	{
  		.name = "kmem.failcnt",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4105
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4106
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4107
4108
4109
4110
  	},
  	{
  		.name = "kmem.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4111
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4112
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4113
  	},
749c54151   Glauber Costa   memcg: aggregate ...
4114
4115
4116
  #ifdef CONFIG_SLABINFO
  	{
  		.name = "kmem.slabinfo",
b047501cd   Vladimir Davydov   memcg: use generi...
4117
4118
4119
4120
  		.seq_start = slab_start,
  		.seq_next = slab_next,
  		.seq_stop = slab_stop,
  		.seq_show = memcg_slab_show,
749c54151   Glauber Costa   memcg: aggregate ...
4121
4122
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4123
  #endif
6bc103498   Tejun Heo   cgroup: convert m...
4124
  	{ },	/* terminate */
af36f906c   Tejun Heo   memcg: always cre...
4125
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4126

c0ff4b854   Raghavendra K T   memcg: rename mem...
4127
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4128
4129
  {
  	struct mem_cgroup_per_node *pn;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4130
  	struct mem_cgroup_per_zone *mz;
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
4131
  	int zone, tmp = node;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4132
4133
4134
4135
4136
4137
4138
4139
  	/*
  	 * This routine is called for each possible node, but it is a bug
  	 * to call kmalloc() for an offline node.
  	 *
  	 * TODO: this routine can waste a lot of memory for nodes that will
  	 *       never be onlined. It would be better to use a memory hotplug
  	 *       callback instead.
  	 */
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
4140
4141
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
17295c88a   Jesper Juhl   memcg: use [kv]za...
4142
  	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4143
4144
  	if (!pn)
  		return 1;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4145

1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4146
4147
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
bea8c150a   Hugh Dickins   memcg: fix hotplu...
4148
  		lruvec_init(&mz->lruvec);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
4149
4150
  		mz->usage_in_excess = 0;
  		mz->on_tree = false;
d79154bb5   Hugh Dickins   memcg: replace me...
4151
  		mz->memcg = memcg;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4152
  	}
54f72fe02   Johannes Weiner   memcg: clean up m...
4153
  	memcg->nodeinfo[node] = pn;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4154
4155
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
4156
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4157
  {
54f72fe02   Johannes Weiner   memcg: clean up m...
4158
  	kfree(memcg->nodeinfo[node]);
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4159
  }
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4160
4161
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
4162
  	struct mem_cgroup *memcg;
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4163
  	size_t size;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4164

8ff69e2c8   Vladimir Davydov   memcg: do not use...
4165
4166
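  	/*
  	 * struct mem_cgroup ends in a flexible nodeinfo[] array, so reserve
  	 * room for a mem_cgroup_per_node pointer for every possible node.
  	 */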
  	size = sizeof(struct mem_cgroup);
  	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4167

8ff69e2c8   Vladimir Davydov   memcg: do not use...
4168
  	memcg = kzalloc(size, GFP_KERNEL);
d79154bb5   Hugh Dickins   memcg: replace me...
4169
  	if (!memcg)
e7bbcdf37   Dan Carpenter   memcontrol: fix p...
4170
  		return NULL;
d79154bb5   Hugh Dickins   memcg: replace me...
4171
4172
  	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  	if (!memcg->stat)
d2e61b8dc   Dan Carpenter   memcg: null deref...
4173
  		goto out_free;
d79154bb5   Hugh Dickins   memcg: replace me...
4174
4175
  	spin_lock_init(&memcg->pcp_counter_lock);
  	return memcg;
d2e61b8dc   Dan Carpenter   memcg: null deref...
4176
4177
  
  out_free:
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4178
  	kfree(memcg);
d2e61b8dc   Dan Carpenter   memcg: null deref...
4179
  	return NULL;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4180
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4181
  /*
c8b2a36fb   Glauber Costa   memcg: execute th...
4182
4183
4184
4185
4186
4187
4188
4189
   * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
   * (Scanning everything at force_empty would be too costly...)
   *
   * Instead of clearing all references at force_empty, we remember
   * the number of references from swap_cgroup and free the mem_cgroup
   * when that count drops to 0.
   *
   * Removal of the cgroup itself succeeds regardless of refs from swap.
59927fb98   Hugh Dickins   memcg: free mem_c...
4190
   */
c8b2a36fb   Glauber Costa   memcg: execute th...
4191
4192
  
  static void __mem_cgroup_free(struct mem_cgroup *memcg)
59927fb98   Hugh Dickins   memcg: free mem_c...
4193
  {
c8b2a36fb   Glauber Costa   memcg: execute th...
4194
  	int node;
59927fb98   Hugh Dickins   memcg: free mem_c...
4195

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
4196
  	mem_cgroup_remove_from_trees(memcg);
c8b2a36fb   Glauber Costa   memcg: execute th...
4197
4198
4199
4200
4201
  
  	for_each_node(node)
  		free_mem_cgroup_per_zone_info(memcg, node);
  
  	free_percpu(memcg->stat);
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4202
  	kfree(memcg);
59927fb98   Hugh Dickins   memcg: free mem_c...
4203
  }
3afe36b1f   Glauber Costa   memcg: always fre...
4204

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4205
4206
4207
  /*
   * Returns the parent mem_cgroup in the memcg hierarchy when use_hierarchy is enabled.
   */
e1aab161e   Glauber Costa   socket: initial c...
4208
  struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4209
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4210
  	if (!memcg->memory.parent)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4211
  		return NULL;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4212
  	return mem_cgroup_from_counter(memcg->memory.parent, memory);
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4213
  }
e1aab161e   Glauber Costa   socket: initial c...
4214
  EXPORT_SYMBOL(parent_mem_cgroup);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4215

0eb253e22   Li Zefan   memcg: fix sectio...
4216
  static struct cgroup_subsys_state * __ref
eb95419b0   Tejun Heo   cgroup: pass arou...
4217
  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8cdea7c05   Balbir Singh   Memory controller...
4218
  {
d142e3e66   Glauber Costa   memcg: split part...
4219
  	struct mem_cgroup *memcg;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
4220
  	long error = -ENOMEM;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4221
  	int node;
8cdea7c05   Balbir Singh   Memory controller...
4222

c0ff4b854   Raghavendra K T   memcg: rename mem...
4223
4224
  	memcg = mem_cgroup_alloc();
  	if (!memcg)
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
4225
  		return ERR_PTR(error);
78fb74669   Pavel Emelianov   Memory controller...
4226

3ed28fa10   Bob Liu   memcg: cleanup fo...
4227
  	for_each_node(node)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4228
  		if (alloc_mem_cgroup_per_zone_info(memcg, node))
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4229
  			goto free_out;
f64c3f549   Balbir Singh   memory controller...
4230

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4231
  	/* root ? */
eb95419b0   Tejun Heo   cgroup: pass arou...
4232
  	if (parent_css == NULL) {
a41c58a66   Hillf Danton   memcg: keep root ...
4233
  		root_mem_cgroup = memcg;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4234
  		page_counter_init(&memcg->memory, NULL);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4235
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4236
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4237
4238
  		page_counter_init(&memcg->memsw, NULL);
  		page_counter_init(&memcg->kmem, NULL);
18f59ea7d   Balbir Singh   memcg: memory cgr...
4239
  	}
28dbc4b6a   Balbir Singh   memcg: memory cgr...
4240

d142e3e66   Glauber Costa   memcg: split part...
4241
4242
  	memcg->last_scanned_node = MAX_NUMNODES;
  	INIT_LIST_HEAD(&memcg->oom_notify);
d142e3e66   Glauber Costa   memcg: split part...
4243
4244
4245
  	memcg->move_charge_at_immigrate = 0;
  	mutex_init(&memcg->thresholds_lock);
  	spin_lock_init(&memcg->move_lock);
70ddf637e   Anton Vorontsov   memcg: add memory...
4246
  	vmpressure_init(&memcg->vmpressure);
fba948078   Tejun Heo   cgroup, memcg: mo...
4247
4248
  	INIT_LIST_HEAD(&memcg->event_list);
  	spin_lock_init(&memcg->event_list_lock);
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
4249
4250
  #ifdef CONFIG_MEMCG_KMEM
  	memcg->kmemcg_id = -1;
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
4251
  #endif
d142e3e66   Glauber Costa   memcg: split part...
4252
4253
4254
4255
4256
4257
4258
4259
4260
  
  	return &memcg->css;
  
  free_out:
  	__mem_cgroup_free(memcg);
  	return ERR_PTR(error);
  }
  
  static int
eb95419b0   Tejun Heo   cgroup: pass arou...
4261
  mem_cgroup_css_online(struct cgroup_subsys_state *css)
d142e3e66   Glauber Costa   memcg: split part...
4262
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4263
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
4264
  	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
4265
  	int ret;
d142e3e66   Glauber Costa   memcg: split part...
4266

15a4c835e   Tejun Heo   cgroup, memcg: im...
4267
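  	/*
  	 * swap_cgroup records keep the owning memcg ID in a 16-bit field,
  	 * which is why css IDs above MEM_CGROUP_ID_MAX cannot be used.
  	 */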
  	if (css->id > MEM_CGROUP_ID_MAX)
4219b2da2   Li Zefan   memcg: fail to cr...
4268
  		return -ENOSPC;
638769869   Tejun Heo   cgroup: add css_p...
4269
  	if (!parent)
d142e3e66   Glauber Costa   memcg: split part...
4270
  		return 0;
0999821b1   Glauber Costa   memcg: replace cg...
4271
  	mutex_lock(&memcg_create_mutex);
d142e3e66   Glauber Costa   memcg: split part...
4272
4273
4274
4275
4276
4277
  
  	memcg->use_hierarchy = parent->use_hierarchy;
  	memcg->oom_kill_disable = parent->oom_kill_disable;
  	memcg->swappiness = mem_cgroup_swappiness(parent);
  
  	if (parent->use_hierarchy) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4278
  		page_counter_init(&memcg->memory, &parent->memory);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4279
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4280
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4281
4282
  		page_counter_init(&memcg->memsw, &parent->memsw);
  		page_counter_init(&memcg->kmem, &parent->kmem);
55007d849   Glauber Costa   memcg: allocate m...
4283

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4284
  		/*
8d76a9797   Li Zefan   memcg: don't need...
4285
4286
  		 * No need to take a reference to the parent because cgroup
  		 * core guarantees its existence.
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4287
  		 */
18f59ea7d   Balbir Singh   memcg: memory cgr...
4288
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4289
  		page_counter_init(&memcg->memory, NULL);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4290
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4291
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4292
4293
  		page_counter_init(&memcg->memsw, NULL);
  		page_counter_init(&memcg->kmem, NULL);
8c7f6edbd   Tejun Heo   cgroup: mark subs...
4294
4295
4296
4297
4298
  		/*
  		 * A deeper hierarchy with use_hierarchy == false doesn't make
  		 * much sense, so let the cgroup subsystem know about this
  		 * unfortunate state in our controller.
  		 */
d142e3e66   Glauber Costa   memcg: split part...
4299
  		if (parent != root_mem_cgroup)
073219e99   Tejun Heo   cgroup: clean up ...
4300
  			memory_cgrp_subsys.broken_hierarchy = true;
18f59ea7d   Balbir Singh   memcg: memory cgr...
4301
  	}
0999821b1   Glauber Costa   memcg: replace cg...
4302
  	mutex_unlock(&memcg_create_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
4303

2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
  	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
  	if (ret)
  		return ret;
  
  	/*
  	 * Make sure the memcg is initialized: mem_cgroup_iter()
  	 * orders reading memcg->initialized against its callers
  	 * reading the memcg members.
  	 */
  	smp_store_release(&memcg->initialized, 1);
  
  	return 0;
8cdea7c05   Balbir Singh   Memory controller...
4316
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4317
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4318
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4319
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
4320
  	struct mem_cgroup_event *event, *tmp;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4321
4322
4323
4324
4325
4326
  
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removing only after rmdir of cgroup
  	 * directory to avoid race between userspace and kernelspace.
  	 */
fba948078   Tejun Heo   cgroup, memcg: mo...
4327
4328
  	spin_lock(&memcg->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4329
4330
4331
  		list_del_init(&event->list);
  		schedule_work(&event->remove);
  	}
fba948078   Tejun Heo   cgroup, memcg: mo...
4332
  	spin_unlock(&memcg->event_list_lock);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
4333

33cb876e9   Michal Hocko   vmpressure: make ...
4334
  	vmpressure_cleanup(&memcg->vmpressure);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
4335
4336
  
  	memcg_deactivate_kmem(memcg);
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4337
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4338
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
8cdea7c05   Balbir Singh   Memory controller...
4339
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4340
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
c268e9946   Daisuke Nishimura   memcg: fix hierar...
4341

10d5ebf40   Li Zefan   memcg: use css_ge...
4342
  	memcg_destroy_kmem(memcg);
465939a1f   Li Zefan   memcg: don't need...
4343
  	__mem_cgroup_free(memcg);
8cdea7c05   Balbir Singh   Memory controller...
4344
  }
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
  /**
   * mem_cgroup_css_reset - reset the states of a mem_cgroup
   * @css: the target css
   *
   * Reset the states of the mem_cgroup associated with @css.  This is
   * invoked when userland requests disabling on the default hierarchy
   * but the memcg is pinned through dependency.  The memcg should stop
   * applying policies and should revert to the vanilla state as it may be
   * made visible again.
   *
   * The current implementation only resets the essential configurations.
   * This needs to be expanded to cover all the visible parts.
   */
  static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4361
4362
4363
  	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
  	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
  	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4364
4365
  	memcg->low = 0;
  	memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4366
  	memcg->soft_limit = PAGE_COUNTER_MAX;
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4367
  }
024914477   Daisuke Nishimura   memcg: move charg...
4368
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
4369
  /* Handlers for move charge at task migration. */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4370
  static int mem_cgroup_do_precharge(unsigned long count)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4371
  {
05b843012   Johannes Weiner   mm: memcontrol: u...
4372
  	int ret;
9476db974   Johannes Weiner   mm: memcontrol: s...
4373
4374
  
  	/* Try a single bulk charge without reclaim first */
00501b531   Johannes Weiner   mm: memcontrol: r...
4375
  	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
9476db974   Johannes Weiner   mm: memcontrol: s...
4376
  	if (!ret) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4377
  		mc.precharge += count;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4378
4379
  		return ret;
  	}
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4380
  	if (ret == -EINTR) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4381
  		cancel_charge(root_mem_cgroup, count);
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4382
4383
  		return ret;
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4384
4385
  
  	/* Try charges one by one with reclaim */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4386
  	while (count--) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4387
  		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
9476db974   Johannes Weiner   mm: memcontrol: s...
4388
4389
4390
  		/*
  		 * In case of failure, any residual charges against
  		 * mc.to will be dropped by mem_cgroup_clear_mc()
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4391
4392
  		 * later on.  However, cancel any charges that are
  		 * bypassed to root right away or they'll be lost.
9476db974   Johannes Weiner   mm: memcontrol: s...
4393
  		 */
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4394
  		if (ret == -EINTR)
00501b531   Johannes Weiner   mm: memcontrol: r...
4395
  			cancel_charge(root_mem_cgroup, 1);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4396
  		if (ret)
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4397
  			return ret;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4398
  		mc.precharge++;
9476db974   Johannes Weiner   mm: memcontrol: s...
4399
  		cond_resched();
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4400
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4401
  	return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4402
4403
4404
  }
  
  /**
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4405
   * get_mctgt_type - get target type of moving charge
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4406
4407
4408
   * @vma: the vma to which the pte to be checked belongs
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
4409
   * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4410
4411
4412
4413
4414
4415
   *
   * Returns
   *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   *     move charge. If @target is not NULL, the page is stored in target->page
   *     with an extra refcount taken (callers must handle it).
024914477   Daisuke Nishimura   memcg: move charg...
4416
4417
4418
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. If @target is not NULL, the entry is stored
   *     in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4419
4420
4421
   *
   * Called with the pte lock held.
   */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4422
4423
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
4424
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4425
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4426
  enum mc_target_type {
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4427
  	MC_TARGET_NONE = 0,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4428
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
4429
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4430
  };
90254a658   Daisuke Nishimura   memcg: clean up m...
4431
4432
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4433
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4434
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4435

90254a658   Daisuke Nishimura   memcg: clean up m...
4436
4437
4438
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4439
  		if (!(mc.flags & MOVE_ANON))
90254a658   Daisuke Nishimura   memcg: clean up m...
4440
  			return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4441
4442
4443
4444
  	} else {
  		if (!(mc.flags & MOVE_FILE))
  			return NULL;
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4445
4446
4447
4448
4449
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4450
  #ifdef CONFIG_SWAP
90254a658   Daisuke Nishimura   memcg: clean up m...
4451
4452
4453
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4454
4455
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4456
  	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
90254a658   Daisuke Nishimura   memcg: clean up m...
4457
  		return NULL;
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4458
4459
4460
4461
  	/*
  	 * Because lookup_swap_cache() updates some statistics counters,
  	 * we call find_get_page() on the swap address_space directly.
  	 */
33806f06d   Shaohua Li   swap: make each s...
4462
  	page = find_get_page(swap_address_space(ent), ent.val);
90254a658   Daisuke Nishimura   memcg: clean up m...
4463
4464
4465
4466
4467
  	if (do_swap_account)
  		entry->val = ent.val;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4468
4469
4470
4471
4472
4473
4474
  #else
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	return NULL;
  }
  #endif
90254a658   Daisuke Nishimura   memcg: clean up m...
4475

87946a722   Daisuke Nishimura   memcg: move charg...
4476
4477
4478
4479
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4480
4481
4482
4483
4484
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4485
  	if (!(mc.flags & MOVE_FILE))
87946a722   Daisuke Nishimura   memcg: move charg...
4486
  		return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4487
  	mapping = vma->vm_file->f_mapping;
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4488
  	pgoff = linear_page_index(vma, addr);
87946a722   Daisuke Nishimura   memcg: move charg...
4489
4490
  
  	/* The page is moved even if it's not RSS of this task (page-faulted). */
aa3b18955   Hugh Dickins   tmpfs: convert me...
4491
4492
  #ifdef CONFIG_SWAP
  	/* shmem/tmpfs may report page out on swap: account for that too. */
139b6a6fb   Johannes Weiner   mm: filemap: upda...
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
  	if (shmem_mapping(mapping)) {
  		page = find_get_entry(mapping, pgoff);
  		if (radix_tree_exceptional_entry(page)) {
  			swp_entry_t swp = radix_to_swp_entry(page);
  			if (do_swap_account)
  				*entry = swp;
  			page = find_get_page(swap_address_space(swp), swp.val);
  		}
  	} else
  		page = find_get_page(mapping, pgoff);
  #else
  	page = find_get_page(mapping, pgoff);
aa3b18955   Hugh Dickins   tmpfs: convert me...
4505
  #endif
87946a722   Daisuke Nishimura   memcg: move charg...
4506
4507
  	return page;
  }
b1b0deabb   Chen Gang   mm: memcontrol: l...
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
  /**
   * mem_cgroup_move_account - move accounting of the page
   * @page: the page
   * @nr_pages: number of regular pages (>1 for huge pages)
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
   *
   * The caller must ensure the following:
   * - the page is not on the LRU (isolate_lru_page() is useful.)
   * - compound_lock is held when nr_pages > 1
   *
   * This function doesn't charge the new cgroup and doesn't uncharge
   * the old cgroup.
   */
  static int mem_cgroup_move_account(struct page *page,
  				   unsigned int nr_pages,
  				   struct mem_cgroup *from,
  				   struct mem_cgroup *to)
  {
  	unsigned long flags;
  	int ret;
  
  	VM_BUG_ON(from == to);
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	/*
  	 * The page is isolated from the LRU, so the collapse function
  	 * will not handle this page. But page splitting can happen.
  	 * Do this check under compound_lock(); the caller must
  	 * hold it.
  	 */
  	ret = -EBUSY;
  	if (nr_pages > 1 && !PageTransHuge(page))
  		goto out;
  
  	/*
  	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
  	 * of its source page while we change it: page migration takes
  	 * both pages off the LRU, but page cache replacement doesn't.
  	 */
  	if (!trylock_page(page))
  		goto out;
  
  	ret = -EINVAL;
  	if (page->mem_cgroup != from)
  		goto out_unlock;
  
  	spin_lock_irqsave(&from->move_lock, flags);
  
  	if (!PageAnon(page) && page_mapped(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  	}
  
  	if (PageWriteback(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  	}
  
  	/*
  	 * It is safe to change page->mem_cgroup here because the page
  	 * is referenced, charged, and isolated - we can't race with
  	 * uncharging, charging, migration, or LRU putback.
  	 */
  
  	/* caller should have done css_get */
  	page->mem_cgroup = to;
  	spin_unlock_irqrestore(&from->move_lock, flags);
  
  	ret = 0;
  
  	local_irq_disable();
  	mem_cgroup_charge_statistics(to, page, nr_pages);
  	memcg_check_events(to, page);
  	mem_cgroup_charge_statistics(from, page, -nr_pages);
  	memcg_check_events(from, page);
  	local_irq_enable();
  out_unlock:
  	unlock_page(page);
  out:
  	return ret;
  }
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4593
  static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
90254a658   Daisuke Nishimura   memcg: clean up m...
4594
4595
4596
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4597
  	enum mc_target_type ret = MC_TARGET_NONE;
90254a658   Daisuke Nishimura   memcg: clean up m...
4598
4599
4600
4601
4602
4603
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4604
  	else if (pte_none(ptent))
87946a722   Daisuke Nishimura   memcg: move charg...
4605
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
4606
4607
  
  	if (!page && !ent.val)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4608
  		return ret;
024914477   Daisuke Nishimura   memcg: move charg...
4609
  	if (page) {
024914477   Daisuke Nishimura   memcg: move charg...
4610
  		/*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4611
  		 * Do only a loose check, without serialization.
1306a85ae   Johannes Weiner   mm: embed the mem...
4612
  		 * mem_cgroup_move_account() checks whether the page is valid
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4613
  		 * under LRU exclusion.
024914477   Daisuke Nishimura   memcg: move charg...
4614
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
4615
  		if (page->mem_cgroup == mc.from) {
024914477   Daisuke Nishimura   memcg: move charg...
4616
4617
4618
4619
4620
4621
4622
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4623
4624
  	/* There is a swap entry and the page doesn't exist or isn't charged */
  	if (ent.val && !ret &&
34c00c319   Li Zefan   memcg: convert to...
4625
  	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4626
4627
4628
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4629
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4630
4631
  	return ret;
  }
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * We don't consider swapping or file-mapped pages because THP does not
   * support them for now.
   * The caller must make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	struct page *page = NULL;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4642
4643
4644
  	enum mc_target_type ret = MC_TARGET_NONE;
  
  	page = pmd_page(pmd);
309381fea   Sasha Levin   mm: dump page whe...
4645
  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4646
  	if (!(mc.flags & MOVE_ANON))
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4647
  		return ret;
1306a85ae   Johannes Weiner   mm: embed the mem...
4648
  	if (page->mem_cgroup == mc.from) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
  		ret = MC_TARGET_PAGE;
  		if (target) {
  			get_page(page);
  			target->page = page;
  		}
  	}
  	return ret;
  }
  #else
  static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	return MC_TARGET_NONE;
  }
  #endif
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4664
4665
4666
4667
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4668
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4669
4670
  	pte_t *pte;
  	spinlock_t *ptl;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4671
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4672
4673
  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
  			mc.precharge += HPAGE_PMD_NR;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4674
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4675
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4676
  	}
033193275   Dave Hansen   pagewalk: only sp...
4677

45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4678
4679
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4680
4681
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4682
  		if (get_mctgt_type(vma, addr, *pte, NULL))
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4683
4684
4685
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4686
4687
  	return 0;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4688
4689
4690
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4691

26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4692
4693
4694
4695
  	struct mm_walk mem_cgroup_count_precharge_walk = {
  		.pmd_entry = mem_cgroup_count_precharge_pte_range,
  		.mm = mm,
  	};
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4696
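  	/*
  	 * Walk the whole address space once just to count how many
  	 * charges the move will need.
  	 */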
  	down_read(&mm->mmap_sem);
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4697
  	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4698
  	up_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4699
4700
4701
4702
4703
4704
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4705
4706
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4707
4708
4709
4710
4711
  	unsigned long precharge = mem_cgroup_count_precharge(mm);
  
  	VM_BUG_ON(mc.moving_task);
  	mc.moving_task = current;
  	return mem_cgroup_do_precharge(precharge);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4712
  }
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4713
4714
  /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
  static void __mem_cgroup_clear_mc(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4715
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4716
4717
  	struct mem_cgroup *from = mc.from;
  	struct mem_cgroup *to = mc.to;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4718
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4719
  	if (mc.precharge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4720
  		cancel_charge(mc.to, mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4721
4722
4723
4724
4725
4726
4727
  		mc.precharge = 0;
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4728
  		cancel_charge(mc.from, mc.moved_charge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4729
  		mc.moved_charge = 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4730
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
4731
4732
  	/* we must fixup refcnts and charges */
  	if (mc.moved_swap) {
483c30b51   Daisuke Nishimura   memcg: improve pe...
4733
  		/* uncharge swap account from the old cgroup */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4734
  		if (!mem_cgroup_is_root(mc.from))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4735
  			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4736

05b843012   Johannes Weiner   mm: memcontrol: u...
4737
  		/*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4738
4739
  		 * we charged both to->memory and to->memsw, so we
  		 * should uncharge to->memory.
05b843012   Johannes Weiner   mm: memcontrol: u...
4740
  		 */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4741
  		if (!mem_cgroup_is_root(mc.to))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4742
  			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
4743
  		css_put_many(&mc.from->css, mc.moved_swap);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4744

4050377b5   Li Zefan   memcg: use css_ge...
4745
  		/* we've already done css_get(mc.to) */
483c30b51   Daisuke Nishimura   memcg: improve pe...
4746
4747
  		mc.moved_swap = 0;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4748
4749
4750
4751
4752
4753
4754
  	memcg_oom_recover(from);
  	memcg_oom_recover(to);
  	wake_up_all(&mc.waitq);
  }
  
  static void mem_cgroup_clear_mc(void)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4755
4756
4757
4758
4759
4760
  	/*
  	 * we must clear moving_task before waking up waiters at the end of
  	 * task migration.
  	 */
  	mc.moving_task = NULL;
  	__mem_cgroup_clear_mc();
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4761
  	spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4762
4763
  	mc.from = NULL;
  	mc.to = NULL;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4764
  	spin_unlock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4765
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4766
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4767
  				 struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4768
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
4769
  	struct task_struct *p = cgroup_taskset_first(tset);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4770
  	int ret = 0;
eb95419b0   Tejun Heo   cgroup: pass arou...
4771
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4772
  	unsigned long move_flags;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4773

ee5e8472b   Glauber Costa   memcg: prevent ch...
4774
4775
4776
4777
4778
  	/*
  	 * We are now committed to this value, whatever it is. Changes in this
  	 * tunable will only affect upcoming migrations, not the current one.
  	 * So we need to save it and keep using it.
  	 */
4db0c3c29   Jason Low   mm: remove rest o...
4779
  	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4780
  	if (move_flags) {
7dc74be03   Daisuke Nishimura   memcg: add interf...
4781
4782
  		struct mm_struct *mm;
  		struct mem_cgroup *from = mem_cgroup_from_task(p);
c0ff4b854   Raghavendra K T   memcg: rename mem...
4783
  		VM_BUG_ON(from == memcg);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4784
4785
4786
4787
  
  		mm = get_task_mm(p);
  		if (!mm)
  			return 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4788
  		/* We move charges only when we move an owner of the mm */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4789
4790
4791
4792
  		if (mm->owner == p) {
  			VM_BUG_ON(mc.from);
  			VM_BUG_ON(mc.to);
  			VM_BUG_ON(mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4793
  			VM_BUG_ON(mc.moved_charge);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4794
  			VM_BUG_ON(mc.moved_swap);
247b1447b   Johannes Weiner   mm: memcontrol: f...
4795

2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4796
  			spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4797
  			mc.from = from;
c0ff4b854   Raghavendra K T   memcg: rename mem...
4798
  			mc.to = memcg;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4799
  			mc.flags = move_flags;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4800
  			spin_unlock(&mc.lock);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4801
  			/* We set mc.moving_task later */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4802
4803
4804
4805
  
  			ret = mem_cgroup_precharge_mc(mm);
  			if (ret)
  				mem_cgroup_clear_mc();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4806
4807
  		}
  		mmput(mm);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4808
4809
4810
  	}
  	return ret;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4811
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4812
  				     struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4813
  {
4e2f245d3   Johannes Weiner   mm: memcontrol: d...
4814
4815
  	if (mc.to)
  		mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4816
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4817
4818
4819
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4820
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4821
  	int ret = 0;
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4822
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4823
4824
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4825
4826
4827
  	enum mc_target_type target_type;
  	union mc_target target;
  	struct page *page;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4828

12724850e   Naoya Horiguchi   memcg: avoid THP ...
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
  	/*
  	 * We don't take compound_lock() here, but no race with THP splitting
  	 * can happen because:
  	 *  - if pmd_trans_huge_lock() returns 1, the relevant THP is not
  	 *    being split, which means there is no concurrent THP split,
  	 *  - if another thread runs into split_huge_page() just after we
  	 *    entered this if-block, it must wait for the page table lock
  	 *    to be released in __split_huge_page_splitting(), where the main
  	 *    part of the THP split has not been executed yet.
  	 */
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4839
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
62ade86ab   Hugh Dickins   memcg,thp: fix re...
4840
  		if (mc.precharge < HPAGE_PMD_NR) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4841
  			spin_unlock(ptl);
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4842
4843
4844
4845
4846
4847
  			return 0;
  		}
  		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
  		if (target_type == MC_TARGET_PAGE) {
  			page = target.page;
  			if (!isolate_lru_page(page)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4848
  				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
1306a85ae   Johannes Weiner   mm: embed the mem...
4849
  							     mc.from, mc.to)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4850
4851
4852
4853
4854
4855
4856
  					mc.precharge -= HPAGE_PMD_NR;
  					mc.moved_charge += HPAGE_PMD_NR;
  				}
  				putback_lru_page(page);
  			}
  			put_page(page);
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4857
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4858
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4859
  	}
45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4860
4861
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4862
4863
4864
4865
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
024914477   Daisuke Nishimura   memcg: move charg...
4866
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4867
4868
4869
  
  		if (!mc.precharge)
  			break;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4870
  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4871
4872
4873
4874
  		case MC_TARGET_PAGE:
  			page = target.page;
  			if (isolate_lru_page(page))
  				goto put;
1306a85ae   Johannes Weiner   mm: embed the mem...
4875
  			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4876
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4877
4878
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4879
4880
  			}
  			putback_lru_page(page);
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4881
  put:			/* get_mctgt_type() gets the page */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4882
4883
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
4884
4885
  		case MC_TARGET_SWAP:
  			ent = target.ent;
e91cbb425   Hugh Dickins   memcg swap: mem_c...
4886
  			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
024914477   Daisuke Nishimura   memcg: move charg...
4887
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
4888
4889
4890
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
4891
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all precharges we got in can_attach().
  		 * We try charging one by one, but don't do any additional
  		 * charges to mc.to if a charge has already failed once during
  		 * the attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4906
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4907
4908
4909
4910
4911
4912
4913
4914
4915
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
  
  static void mem_cgroup_move_charge(struct mm_struct *mm)
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4916
4917
4918
4919
  	struct mm_walk mem_cgroup_move_charge_walk = {
  		.pmd_entry = mem_cgroup_move_charge_pte_range,
  		.mm = mm,
  	};
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4920
4921
  
  	lru_add_drain_all();
312722cbb   Johannes Weiner   mm: memcontrol: s...
4922
4923
4924
4925
4926
4927
4928
  	/*
  	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
  	 * move_lock while we're moving its pages to another memcg.
  	 * Then wait for already started RCU-only updates to finish.
  	 */
  	atomic_inc(&mc.from->moving_account);
  	synchronize_rcu();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
  retry:
  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  		/*
  		 * Someone who is holding the mmap_sem might be waiting in
  		 * the waitq. So we cancel all extra charges, wake up all waiters,
  		 * and retry. Because we cancel precharges, we might not be able
  		 * to move enough charges, but moving charge is a best-effort
  		 * feature anyway, so it wouldn't be a big problem.
  		 */
  		__mem_cgroup_clear_mc();
  		cond_resched();
  		goto retry;
  	}
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4942
4943
4944
4945
4946
  	/*
  	 * When we have consumed all precharges and failed in doing
  	 * additional charge, the page walk just aborts.
  	 */
  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4947
  	up_read(&mm->mmap_sem);
312722cbb   Johannes Weiner   mm: memcontrol: s...
4948
  	atomic_dec(&mc.from->moving_account);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4949
  }
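  /*
   * Orientation note (a sketch, not authoritative): in this kernel,
   * charge moving is requested through the legacy
   * memory.move_charge_at_immigrate knob.  When it is enabled on the
   * destination memcg, .can_attach records mc.from/mc.to and builds up
   * precharges, and the .attach callback below calls
   * mem_cgroup_move_charge() to walk the task's page tables and move the
   * charges it finds there.
   */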
eb95419b0   Tejun Heo   cgroup: pass arou...
4950
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4951
  				 struct cgroup_taskset *tset)
67e465a77   Balbir Singh   Memory controller...
4952
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
4953
  	struct task_struct *p = cgroup_taskset_first(tset);
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
4954
  	struct mm_struct *mm = get_task_mm(p);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4955

dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4956
  	if (mm) {
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
4957
4958
  		if (mc.to)
  			mem_cgroup_move_charge(mm);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4959
4960
  		mmput(mm);
  	}
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
4961
4962
  	if (mc.to)
  		mem_cgroup_clear_mc();
67e465a77   Balbir Singh   Memory controller...
4963
  }
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4964
  #else	/* !CONFIG_MMU */
eb95419b0   Tejun Heo   cgroup: pass arou...
4965
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4966
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4967
4968
4969
  {
  	return 0;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4970
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4971
  				     struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4972
4973
  {
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4974
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4975
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
4976
4977
4978
  {
  }
  #endif
67e465a77   Balbir Singh   Memory controller...
4979

f00baae7a   Tejun Heo   memcg: force use_...
4980
4981
  /*
   * Cgroup retains root cgroups across [un]mount cycles, making it necessary
aa6ec29be   Tejun Heo   cgroup: remove sa...
4982
4983
   * to verify whether we're attached to the default hierarchy on each mount
   * attempt.
f00baae7a   Tejun Heo   memcg: force use_...
4984
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
4985
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
f00baae7a   Tejun Heo   memcg: force use_...
4986
4987
  {
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
4988
  	 * use_hierarchy is forced on the default hierarchy.  cgroup core
f00baae7a   Tejun Heo   memcg: force use_...
4989
4990
4991
  	 * guarantees that @root doesn't have any children, so turning it
  	 * on for the root memcg is enough.
  	 */
aa6ec29be   Tejun Heo   cgroup: remove sa...
4992
  	if (cgroup_on_dfl(root_css->cgroup))
7feee590b   Vladimir Davydov   memcg: disable hi...
4993
4994
4995
  		root_mem_cgroup->use_hierarchy = true;
  	else
  		root_mem_cgroup->use_hierarchy = false;
f00baae7a   Tejun Heo   memcg: force use_...
4996
  }
241994ed8   Johannes Weiner   mm: memcontrol: d...
4997
4998
4999
5000
5001
5002
5003
5004
5005
  static u64 memory_current_read(struct cgroup_subsys_state *css,
  			       struct cftype *cft)
  {
  	return mem_cgroup_usage(mem_cgroup_from_css(css), false);
  }
  
  static int memory_low_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5006
  	unsigned long low = READ_ONCE(memcg->low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5007
5008
  
  	if (low == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5009
5010
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
  	else
  		seq_printf(m, "%llu
  ", (u64)low * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_low_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long low;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5026
  	err = page_counter_memparse(buf, "max", &low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
  	if (err)
  		return err;
  
  	memcg->low = low;
  
  	return nbytes;
  }
  
  static int memory_high_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5038
  	unsigned long high = READ_ONCE(memcg->high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5039
5040
  
  	if (high == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5041
5042
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
  	else
  		seq_printf(m, "%llu
  ", (u64)high * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_high_write(struct kernfs_open_file *of,
  				 char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long high;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5058
  	err = page_counter_memparse(buf, "max", &high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
  	if (err)
  		return err;
  
  	memcg->high = high;
  
  	return nbytes;
  }
  
  static int memory_max_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5070
  	unsigned long max = READ_ONCE(memcg->memory.limit);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5071
5072
  
  	if (max == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5073
5074
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
  	else
  		seq_printf(m, "%llu
  ", (u64)max * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_max_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long max;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5090
  	err = page_counter_memparse(buf, "max", &max);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
  	if (err)
  		return err;
  
  	err = mem_cgroup_resize_limit(memcg, max);
  	if (err)
  		return err;
  
  	return nbytes;
  }
  
  static int memory_events_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  
  	seq_printf(m, "low %lu
  ", mem_cgroup_read_events(memcg, MEMCG_LOW));
  	seq_printf(m, "high %lu
  ", mem_cgroup_read_events(memcg, MEMCG_HIGH));
  	seq_printf(m, "max %lu
  ", mem_cgroup_read_events(memcg, MEMCG_MAX));
  	seq_printf(m, "oom %lu
  ", mem_cgroup_read_events(memcg, MEMCG_OOM));
  
  	return 0;
  }
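  /*
   * Illustrative sketch (counter values invented): reading the
   * "memory.events" file backed by memory_events_show() yields one
   * "<key> <count>" pair per line, e.g.:
   *
   *	low 0
   *	high 37
   *	max 4
   *	oom 0
   */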
  
  static struct cftype memory_files[] = {
  	{
  		.name = "current",
  		.read_u64 = memory_current_read,
  	},
  	{
  		.name = "low",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_low_show,
  		.write = memory_low_write,
  	},
  	{
  		.name = "high",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_high_show,
  		.write = memory_high_write,
  	},
  	{
  		.name = "max",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_max_show,
  		.write = memory_max_write,
  	},
  	{
  		.name = "events",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_events_show,
  	},
  	{ }	/* terminate */
  };
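  /*
   * Illustrative sketch (mount point and values are assumptions): on the
   * default hierarchy the cftypes above appear as memory.current,
   * memory.low, memory.high, memory.max and memory.events.  A possible
   * shell session for a child group "foo":
   *
   *	cat /sys/fs/cgroup/foo/memory.current
   *	echo 512M > /sys/fs/cgroup/foo/memory.high
   *	echo max > /sys/fs/cgroup/foo/memory.max
   *
   * "max" and size suffixes are accepted because the write handlers
   * parse their input with page_counter_memparse().
   */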
073219e99   Tejun Heo   cgroup: clean up ...
5147
  struct cgroup_subsys memory_cgrp_subsys = {
92fb97487   Tejun Heo   cgroup: rename ->...
5148
  	.css_alloc = mem_cgroup_css_alloc,
d142e3e66   Glauber Costa   memcg: split part...
5149
  	.css_online = mem_cgroup_css_online,
92fb97487   Tejun Heo   cgroup: rename ->...
5150
5151
  	.css_offline = mem_cgroup_css_offline,
  	.css_free = mem_cgroup_css_free,
1ced953b1   Tejun Heo   blkcg, memcg: mak...
5152
  	.css_reset = mem_cgroup_css_reset,
7dc74be03   Daisuke Nishimura   memcg: add interf...
5153
5154
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
67e465a77   Balbir Singh   Memory controller...
5155
  	.attach = mem_cgroup_move_task,
f00baae7a   Tejun Heo   memcg: force use_...
5156
  	.bind = mem_cgroup_bind,
241994ed8   Johannes Weiner   mm: memcontrol: d...
5157
5158
  	.dfl_cftypes = memory_files,
  	.legacy_cftypes = mem_cgroup_legacy_files,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5159
  	.early_init = 0,
8cdea7c05   Balbir Singh   Memory controller...
5160
  };
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5161

241994ed8   Johannes Weiner   mm: memcontrol: d...
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
  /**
   * mem_cgroup_events - count memory events against a cgroup
   * @memcg: the memory cgroup
   * @idx: the event index
   * @nr: the number of events to account for
   */
  void mem_cgroup_events(struct mem_cgroup *memcg,
  		       enum mem_cgroup_events_index idx,
  		       unsigned int nr)
  {
  	this_cpu_add(memcg->stat->events[idx], nr);
  }
  
  /**
   * mem_cgroup_low - check if memory consumption is below the normal range
   * @root: the highest ancestor to consider
   * @memcg: the memory cgroup to check
   *
   * Returns %true if memory consumption of @memcg, and that of all
   * configurable ancestors up to @root, is below the normal range.
   */
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  {
  	if (mem_cgroup_disabled())
  		return false;
  
  	/*
  	 * The toplevel group doesn't have a configurable range, so
  	 * it's never low when looked at directly, and it is not
  	 * considered an ancestor when assessing the hierarchy.
  	 */
  
  	if (memcg == root_mem_cgroup)
  		return false;
4e54dede3   Michal Hocko   memcg: fix low li...
5196
  	if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5197
5198
5199
5200
5201
5202
5203
  		return false;
  
  	while (memcg != root) {
  		memcg = parent_mem_cgroup(memcg);
  
  		if (memcg == root_mem_cgroup)
  			break;
4e54dede3   Michal Hocko   memcg: fix low li...
5204
  		if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5205
5206
5207
5208
  			return false;
  	}
  	return true;
  }
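  /*
   * Illustrative sketch (not lifted from the reclaim code): a
   * reclaim-side caller is expected to skip memcgs that mem_cgroup_low()
   * reports as protected, and to count a MEMCG_LOW event when it has to
   * reclaim from them anyway, roughly:
   *
   *	if (mem_cgroup_low(root, memcg)) {
   *		if (!force_scan)
   *			continue;
   *		mem_cgroup_events(memcg, MEMCG_LOW, 1);
   *	}
   *
   * "force_scan" is a placeholder for whatever condition the caller uses
   * to override low protection.
   */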
00501b531   Johannes Weiner   mm: memcontrol: r...
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
  /**
   * mem_cgroup_try_charge - try charging a page
   * @page: page to charge
   * @mm: mm context of the victim
   * @gfp_mask: reclaim mode
   * @memcgp: charged memcg return
   *
   * Try to charge @page to the memcg that @mm belongs to, reclaiming
   * pages according to @gfp_mask if necessary.
   *
   * Returns 0 on success, with *@memcgp pointing to the charged memcg.
   * Otherwise, an error code is returned.
   *
   * After page->mapping has been set up, the caller must finalize the
   * charge with mem_cgroup_commit_charge().  Or abort the transaction
   * with mem_cgroup_cancel_charge() in case page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
  			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
  {
  	struct mem_cgroup *memcg = NULL;
  	unsigned int nr_pages = 1;
  	int ret = 0;
  
  	if (mem_cgroup_disabled())
  		goto out;
  
  	if (PageSwapCache(page)) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5237
5238
5239
5240
5241
5242
5243
  		/*
  		 * Every swap fault against a single page tries to charge the
  		 * page, so bail out as early as possible.  shmem_unuse() encounters
  		 * already charged pages, too.  The USED bit is protected by
  		 * the page lock, which serializes swap cache removal, which
  		 * in turn serializes uncharging.
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5244
  		if (page->mem_cgroup)
00501b531   Johannes Weiner   mm: memcontrol: r...
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
  			goto out;
  	}
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	if (do_swap_account && PageSwapCache(page))
  		memcg = try_get_mem_cgroup_from_page(page);
  	if (!memcg)
  		memcg = get_mem_cgroup_from_mm(mm);
  
  	ret = try_charge(memcg, gfp_mask, nr_pages);
  
  	css_put(&memcg->css);
  
  	if (ret == -EINTR) {
  		memcg = root_mem_cgroup;
  		ret = 0;
  	}
  out:
  	*memcgp = memcg;
  	return ret;
  }
  
  /**
   * mem_cgroup_commit_charge - commit a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   * @lrucare: page might be on LRU already
   *
   * Finalize a charge transaction started by mem_cgroup_try_charge(),
   * after page->mapping has been set up.  This must happen atomically
   * as part of the page instantiation, i.e. under the page table lock
   * for anonymous pages, under the page lock for page and swap cache.
   *
   * In addition, the page must not be on the LRU during the commit, to
   * prevent racing with task migration.  If it might be, use @lrucare.
   *
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      bool lrucare)
  {
  	unsigned int nr_pages = 1;
  
  	VM_BUG_ON_PAGE(!page->mapping, page);
  	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5304
  	commit_charge(page, memcg, lrucare);
00501b531   Johannes Weiner   mm: memcontrol: r...
5305
5306
5307
5308
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5309
5310
5311
5312
  	local_irq_disable();
  	mem_cgroup_charge_statistics(memcg, page, nr_pages);
  	memcg_check_events(memcg, page);
  	local_irq_enable();
00501b531   Johannes Weiner   mm: memcontrol: r...
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
  
  	if (do_swap_account && PageSwapCache(page)) {
  		swp_entry_t entry = { .val = page_private(page) };
  		/*
  		 * The swap entry might not get freed for a long time,
  		 * so let's not wait for it.  The page already received a
  		 * memory+swap charge, so drop the swap entry duplicate.
  		 */
  		mem_cgroup_uncharge_swap(entry);
  	}
  }
  
  /**
   * mem_cgroup_cancel_charge - cancel a page charge
   * @page: page whose pending charge to cancel
   * @memcg: memcg the page was going to be charged to
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
  void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
  {
  	unsigned int nr_pages = 1;
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	cancel_charge(memcg, nr_pages);
  }
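  /*
   * Illustrative sketch of the charge transaction from a caller's point
   * of view (labels and the "instantiation_failed" condition are
   * placeholders, not real callers):
   *
   *	struct mem_cgroup *memcg;
   *
   *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
   *		goto oom;
   *	... set up page->mapping / page tables ...
   *	if (instantiation_failed) {
   *		mem_cgroup_cancel_charge(page, memcg);
   *		goto unwind;
   *	}
   *	mem_cgroup_commit_charge(page, memcg, false);
   *
   * Callers pass lrucare == true to the commit when the page might
   * already be on the LRU, as the kerneldoc above explains.
   */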
747db954c   Johannes Weiner   mm: memcontrol: u...
5353
  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
747db954c   Johannes Weiner   mm: memcontrol: u...
5354
5355
5356
  			   unsigned long nr_anon, unsigned long nr_file,
  			   unsigned long nr_huge, struct page *dummy_page)
  {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5357
  	unsigned long nr_pages = nr_anon + nr_file;
747db954c   Johannes Weiner   mm: memcontrol: u...
5358
  	unsigned long flags;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5359
  	if (!mem_cgroup_is_root(memcg)) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5360
5361
5362
  		page_counter_uncharge(&memcg->memory, nr_pages);
  		if (do_swap_account)
  			page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5363
5364
  		memcg_oom_recover(memcg);
  	}
747db954c   Johannes Weiner   mm: memcontrol: u...
5365
5366
5367
5368
5369
5370
  
  	local_irq_save(flags);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5371
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5372
5373
  	memcg_check_events(memcg, dummy_page);
  	local_irq_restore(flags);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
5374
5375
  
  	if (!mem_cgroup_is_root(memcg))
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5376
  		css_put_many(&memcg->css, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5377
5378
5379
5380
5381
  }
  
  static void uncharge_list(struct list_head *page_list)
  {
  	struct mem_cgroup *memcg = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5382
5383
5384
5385
  	unsigned long nr_anon = 0;
  	unsigned long nr_file = 0;
  	unsigned long nr_huge = 0;
  	unsigned long pgpgout = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5386
5387
5388
5389
5390
5391
  	struct list_head *next;
  	struct page *page;
  
  	next = page_list->next;
  	do {
  		unsigned int nr_pages = 1;
747db954c   Johannes Weiner   mm: memcontrol: u...
5392
5393
5394
5395
5396
5397
  
  		page = list_entry(next, struct page, lru);
  		next = page->lru.next;
  
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		VM_BUG_ON_PAGE(page_count(page), page);
1306a85ae   Johannes Weiner   mm: embed the mem...
5398
  		if (!page->mem_cgroup)
747db954c   Johannes Weiner   mm: memcontrol: u...
5399
5400
5401
5402
  			continue;
  
  		/*
  		 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
5403
  		 * page->mem_cgroup at this point; we have fully
298333157   Johannes Weiner   mm: memcontrol: r...
5404
  		 * exclusive access to the page.
747db954c   Johannes Weiner   mm: memcontrol: u...
5405
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5406
  		if (memcg != page->mem_cgroup) {
747db954c   Johannes Weiner   mm: memcontrol: u...
5407
  			if (memcg) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5408
5409
5410
  				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  					       nr_huge, page);
  				pgpgout = nr_anon = nr_file = nr_huge = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5411
  			}
1306a85ae   Johannes Weiner   mm: embed the mem...
5412
  			memcg = page->mem_cgroup;
747db954c   Johannes Weiner   mm: memcontrol: u...
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
  		}
  
  		if (PageTransHuge(page)) {
  			nr_pages <<= compound_order(page);
  			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  			nr_huge += nr_pages;
  		}
  
  		if (PageAnon(page))
  			nr_anon += nr_pages;
  		else
  			nr_file += nr_pages;
1306a85ae   Johannes Weiner   mm: embed the mem...
5425
  		page->mem_cgroup = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5426
5427
5428
5429
5430
  
  		pgpgout++;
  	} while (next != page_list);
  
  	if (memcg)
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5431
5432
  		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  			       nr_huge, page);
747db954c   Johannes Weiner   mm: memcontrol: u...
5433
  }
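  /*
   * Design note (sketch): uncharge_list() accumulates counts as it walks
   * the list and only calls uncharge_batch() when it crosses into a page
   * owned by a different memcg (and once more at the end), so the
   * page_counter and statistics updates are issued per run of same-memcg
   * pages rather than per page.
   */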
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5434
5435
5436
5437
5438
5439
5440
5441
5442
  /**
   * mem_cgroup_uncharge - uncharge a page
   * @page: page to uncharge
   *
   * Uncharge a page previously charged with mem_cgroup_try_charge() and
   * mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge(struct page *page)
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5443
5444
  	if (mem_cgroup_disabled())
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5445
  	/* Don't touch page->lru of any random page, pre-check: */
1306a85ae   Johannes Weiner   mm: embed the mem...
5446
  	if (!page->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5447
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5448
5449
5450
  	INIT_LIST_HEAD(&page->lru);
  	uncharge_list(&page->lru);
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5451

747db954c   Johannes Weiner   mm: memcontrol: u...
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
  /**
   * mem_cgroup_uncharge_list - uncharge a list of pages
   * @page_list: list of pages to uncharge
   *
   * Uncharge a list of pages previously charged with
   * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge_list(struct list_head *page_list)
  {
  	if (mem_cgroup_disabled())
  		return;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5463

747db954c   Johannes Weiner   mm: memcontrol: u...
5464
5465
  	if (!list_empty(page_list))
  		uncharge_list(page_list);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5466
5467
5468
5469
5470
5471
  }
  
  /**
   * mem_cgroup_migrate - migrate a charge to another page
   * @oldpage: currently charged page
   * @newpage: page to transfer the charge to
f5e03a498   Michal Hocko   memcg, shmem: fix...
5472
   * @lrucare: either or both pages might be on the LRU already
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5473
5474
5475
5476
5477
5478
5479
5480
   *
   * Migrate the charge from @oldpage to @newpage.
   *
   * Both pages must be locked, and @newpage->mapping must be set up.
   */
  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  			bool lrucare)
  {
298333157   Johannes Weiner   mm: memcontrol: r...
5481
  	struct mem_cgroup *memcg;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5482
5483
5484
5485
5486
5487
5488
  	int isolated;
  
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
  	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5489
5490
  	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
  		       newpage);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5491
5492
5493
5494
5495
  
  	if (mem_cgroup_disabled())
  		return;
  
  	/* Page cache replacement: new page already charged? */
1306a85ae   Johannes Weiner   mm: embed the mem...
5496
  	if (newpage->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5497
  		return;
7d5e32457   Johannes Weiner   mm: memcontrol: c...
5498
5499
5500
5501
5502
5503
  	/*
  	 * Swapcache readahead pages can get migrated before being
  	 * charged, and migration from compaction can happen to an
  	 * uncharged page when the PFN walker finds a page that
  	 * reclaim just put back on the LRU but has not released yet.
  	 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5504
  	memcg = oldpage->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
5505
  	if (!memcg)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5506
  		return;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5507
5508
  	if (lrucare)
  		lock_page_lru(oldpage, &isolated);
1306a85ae   Johannes Weiner   mm: embed the mem...
5509
  	oldpage->mem_cgroup = NULL;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5510
5511
5512
  
  	if (lrucare)
  		unlock_page_lru(oldpage, isolated);
298333157   Johannes Weiner   mm: memcontrol: r...
5513
  	commit_charge(newpage, memcg, lrucare);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5514
  }
2d11085e4   Michal Hocko   memcg: do not cre...
5515
  /*
1081312f9   Michal Hocko   memcg: cleanup me...
5516
5517
5518
5519
5520
5521
   * subsys_initcall() for the memory controller.
   *
   * Some parts like hotcpu_notifier() have to be initialized from this context
   * because of lock dependencies (cgroup_lock -> cpu hotplug), but basically
   * everything that doesn't depend on a specific mem_cgroup structure should
   * be initialized from here.
2d11085e4   Michal Hocko   memcg: do not cre...
5522
5523
5524
   */
  static int __init mem_cgroup_init(void)
  {
95a045f63   Johannes Weiner   mm: memcontrol: c...
5525
  	int cpu, node;
2d11085e4   Michal Hocko   memcg: do not cre...
5526
  	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
95a045f63   Johannes Weiner   mm: memcontrol: c...
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
  
  	for_each_possible_cpu(cpu)
  		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
  			  drain_local_stock);
  
  	for_each_node(node) {
  		struct mem_cgroup_tree_per_node *rtpn;
  		int zone;
  
  		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
  				    node_online(node) ? node : NUMA_NO_NODE);
  
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			struct mem_cgroup_tree_per_zone *rtpz;
  
  			rtpz = &rtpn->rb_tree_per_zone[zone];
  			rtpz->rb_root = RB_ROOT;
  			spin_lock_init(&rtpz->lock);
  		}
  		soft_limit_tree.rb_tree_per_node[node] = rtpn;
  	}
2d11085e4   Michal Hocko   memcg: do not cre...
5548
5549
5550
  	return 0;
  }
  subsys_initcall(mem_cgroup_init);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
  
  #ifdef CONFIG_MEMCG_SWAP
  /**
   * mem_cgroup_swapout - transfer a memsw charge to swap
   * @page: page whose memsw charge to transfer
   * @entry: swap entry to move the charge to
   *
   * Transfer the memsw charge of @page to @entry.
   */
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short oldid;
  
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	VM_BUG_ON_PAGE(page_count(page), page);
  
  	if (!do_swap_account)
  		return;
  
  	memcg = page->mem_cgroup;
  
  	/* Readahead page, never charged */
  	if (!memcg)
  		return;
  
  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
  	VM_BUG_ON_PAGE(oldid, page);
  	mem_cgroup_swap_statistics(memcg, true);
  
  	page->mem_cgroup = NULL;
  
  	if (!mem_cgroup_is_root(memcg))
  		page_counter_uncharge(&memcg->memory, 1);
f371763a7   Johannes Weiner   mm: memcontrol: f...
5585
  	/* Caller disabled preemption with mapping->tree_lock */
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
  	mem_cgroup_charge_statistics(memcg, page, -1);
  	memcg_check_events(memcg, page);
  }
  
  /**
   * mem_cgroup_uncharge_swap - uncharge a swap entry
   * @entry: swap entry to uncharge
   *
   * Drop the memsw charge associated with @entry.
   */
  void mem_cgroup_uncharge_swap(swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short id;
  
  	if (!do_swap_account)
  		return;
  
  	id = swap_cgroup_record(entry, 0);
  	rcu_read_lock();
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
5606
  	memcg = mem_cgroup_from_id(id);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
  	if (memcg) {
  		if (!mem_cgroup_is_root(memcg))
  			page_counter_uncharge(&memcg->memsw, 1);
  		mem_cgroup_swap_statistics(memcg, false);
  		css_put(&memcg->css);
  	}
  	rcu_read_unlock();
  }
  
  /* For remembering the boot option */
  #ifdef CONFIG_MEMCG_SWAP_ENABLED
  static int really_do_swap_account __initdata = 1;
  #else
  static int really_do_swap_account __initdata;
  #endif
  
  static int __init enable_swap_account(char *s)
  {
  	if (!strcmp(s, "1"))
  		really_do_swap_account = 1;
  	else if (!strcmp(s, "0"))
  		really_do_swap_account = 0;
  	return 1;
  }
  __setup("swapaccount=", enable_swap_account);
  
  static struct cftype memsw_cgroup_files[] = {
  	{
  		.name = "memsw.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
  		.write = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.failcnt",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{ },	/* terminate */
  };
  
  static int __init mem_cgroup_swap_init(void)
  {
  	if (!mem_cgroup_disabled() && really_do_swap_account) {
  		do_swap_account = 1;
  		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
  						  memsw_cgroup_files));
  	}
  	return 0;
  }
  subsys_initcall(mem_cgroup_swap_init);
  
  #endif /* CONFIG_MEMCG_SWAP */