Blame view

mm/memcontrol.c 153 KB
8cdea7c05   Balbir Singh   Memory controller...
1
2
3
4
5
  /* memcontrol.c - Memory Controller
   *
   * Copyright IBM Corporation, 2007
   * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   *
78fb74669   Pavel Emelianov   Memory controller...
6
7
8
   * Copyright 2007 OpenVZ SWsoft Inc
   * Author: Pavel Emelianov <xemul@openvz.org>
   *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
9
10
11
12
   * Memory thresholds
   * Copyright (C) 2009 Nokia Corporation
   * Author: Kirill A. Shutemov
   *
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
13
14
15
16
   * Kernel Memory Controller
   * Copyright (C) 2012 Parallels Inc. and Google Inc.
   * Authors: Glauber Costa and Suleiman Souhlal
   *
1575e68b3   Johannes Weiner   mm: memcontrol: u...
17
18
19
20
21
22
   * Native page reclaim
   * Charge lifetime sanitation
   * Lockless page tracking & accounting
   * Unified hierarchy configuration model
   * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
   *
8cdea7c05   Balbir Singh   Memory controller...
23
24
25
26
27
28
29
30
31
32
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   * GNU General Public License for more details.
   */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
33
  #include <linux/page_counter.h>
8cdea7c05   Balbir Singh   Memory controller...
34
35
  #include <linux/memcontrol.h>
  #include <linux/cgroup.h>
78fb74669   Pavel Emelianov   Memory controller...
36
  #include <linux/mm.h>
4ffef5fef   Daisuke Nishimura   memcg: move charg...
37
  #include <linux/hugetlb.h>
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
38
  #include <linux/pagemap.h>
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
39
  #include <linux/smp.h>
8a9f3ccd2   Balbir Singh   Memory controller...
40
  #include <linux/page-flags.h>
66e1707bc   Balbir Singh   Memory controller...
41
  #include <linux/backing-dev.h>
8a9f3ccd2   Balbir Singh   Memory controller...
42
43
  #include <linux/bit_spinlock.h>
  #include <linux/rcupdate.h>
e222432bf   Balbir Singh   memcg: show memcg...
44
  #include <linux/limits.h>
b9e15bafd   Paul Gortmaker   mm: Add export.h ...
45
  #include <linux/export.h>
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
46
  #include <linux/mutex.h>
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
47
  #include <linux/rbtree.h>
b6ac57d50   Balbir Singh   memcgroup: move m...
48
  #include <linux/slab.h>
66e1707bc   Balbir Singh   Memory controller...
49
  #include <linux/swap.h>
024914477   Daisuke Nishimura   memcg: move charg...
50
  #include <linux/swapops.h>
66e1707bc   Balbir Singh   Memory controller...
51
  #include <linux/spinlock.h>
2e72b6347   Kirill A. Shutemov   memcg: implement ...
52
  #include <linux/eventfd.h>
79bd9814e   Tejun Heo   cgroup, memcg: mo...
53
  #include <linux/poll.h>
2e72b6347   Kirill A. Shutemov   memcg: implement ...
54
  #include <linux/sort.h>
66e1707bc   Balbir Singh   Memory controller...
55
  #include <linux/fs.h>
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
56
  #include <linux/seq_file.h>
70ddf637e   Anton Vorontsov   memcg: add memory...
57
  #include <linux/vmpressure.h>
b69408e88   Christoph Lameter   vmscan: Use an in...
58
  #include <linux/mm_inline.h>
5d1ea48bd   Johannes Weiner   mm: page_cgroup: ...
59
  #include <linux/swap_cgroup.h>
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
60
  #include <linux/cpu.h>
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
61
  #include <linux/oom.h>
0056f4e66   Johannes Weiner   mm: memcg: lockde...
62
  #include <linux/lockdep.h>
79bd9814e   Tejun Heo   cgroup, memcg: mo...
63
  #include <linux/file.h>
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
64
  #include "internal.h"
d1a4c0b37   Glauber Costa   tcp memory pressu...
65
  #include <net/sock.h>
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
66
  #include <net/ip.h>
d1a4c0b37   Glauber Costa   tcp memory pressu...
67
  #include <net/tcp_memcontrol.h>
f35c3a8ee   Qiang Huang   memcg, kmem: use ...
68
  #include "slab.h"
8cdea7c05   Balbir Singh   Memory controller...
69

8697d3319   Balbir Singh   Memory controller...
70
  #include <asm/uaccess.h>
cc8e970c3   KOSAKI Motohiro   memcg: add mm_vms...
71
  #include <trace/events/vmscan.h>
073219e99   Tejun Heo   cgroup: clean up ...
72
73
  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  EXPORT_SYMBOL(memory_cgrp_subsys);
68ae564bb   David Rientjes   mm, memcg: avoid ...
74

a181b0e88   KAMEZAWA Hiroyuki   memcg: make globa...
75
  #define MEM_CGROUP_RECLAIM_RETRIES	5
6bbda35ce   Kirill A. Shutemov   memcg: mark more ...
76
  static struct mem_cgroup *root_mem_cgroup __read_mostly;
56161634e   Tejun Heo   memcg: add mem_cg...
77
  struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
8cdea7c05   Balbir Singh   Memory controller...
78

21afa38ee   Johannes Weiner   mm: memcontrol: c...
79
  /* Whether the swap controller is active */
c255a4580   Andrew Morton   memcg: rename con...
80
  #ifdef CONFIG_MEMCG_SWAP
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
81
  int do_swap_account __read_mostly;
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
82
  #else
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
83
  #define do_swap_account		0
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
84
  #endif
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
85
86
87
  static const char * const mem_cgroup_stat_names[] = {
  	"cache",
  	"rss",
b070e65c0   David Rientjes   mm, memcg: add rs...
88
  	"rss_huge",
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
89
  	"mapped_file",
c4843a759   Greg Thelen   memcg: add per cg...
90
  	"dirty",
3ea67d06e   Sha Zhengju   memcg: add per cg...
91
  	"writeback",
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
92
93
  	"swap",
  };
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
94
95
96
97
98
99
  static const char * const mem_cgroup_events_names[] = {
  	"pgpgin",
  	"pgpgout",
  	"pgfault",
  	"pgmajfault",
  };
58cf188ed   Sha Zhengju   memcg, oom: provi...
100
101
102
103
104
105
106
  static const char * const mem_cgroup_lru_names[] = {
  	"inactive_anon",
  	"active_anon",
  	"inactive_file",
  	"active_file",
  	"unevictable",
  };
7a159cc9d   Johannes Weiner   memcg: use native...
107
108
109
110
111
112
113
114
  /*
   * Per memcg event counter is incremented at every pagein/pageout. With THP,
   * it will be incremented by the number of pages. This counter is used to
   * trigger periodic events. This is straightforward and better than using
   * jiffies etc. to handle periodic memcg events.
   */
  enum mem_cgroup_events_target {
  	MEM_CGROUP_TARGET_THRESH,
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
115
  	MEM_CGROUP_TARGET_SOFTLIMIT,
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
116
  	MEM_CGROUP_TARGET_NUMAINFO,
7a159cc9d   Johannes Weiner   memcg: use native...
117
118
  	MEM_CGROUP_NTARGETS,
  };
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
119
120
121
  #define THRESHOLDS_EVENTS_TARGET 128
  #define SOFTLIMIT_EVENTS_TARGET 1024
  #define NUMAINFO_EVENTS_TARGET	1024
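
  /*
   * Editor's note (illustration, not part of the original file): each target
   * is re-armed that many page events into the future after it fires, as
   * mem_cgroup_event_ratelimit() below does, e.g.:
   *
   *	if ((long)next - (long)val < 0)
   *		next = val + THRESHOLDS_EVENTS_TARGET;
   */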
e9f8974f2   Johannes Weiner   memcg: break out ...
122

d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
123
  struct mem_cgroup_stat_cpu {
7a159cc9d   Johannes Weiner   memcg: use native...
124
  	long count[MEM_CGROUP_STAT_NSTATS];
241994ed8   Johannes Weiner   mm: memcontrol: d...
125
  	unsigned long events[MEMCG_NR_EVENTS];
13114716c   Johannes Weiner   mm: memcg: keep r...
126
  	unsigned long nr_page_events;
7a159cc9d   Johannes Weiner   memcg: use native...
127
  	unsigned long targets[MEM_CGROUP_NTARGETS];
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
128
  };
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
129
130
  struct reclaim_iter {
  	struct mem_cgroup *position;
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
131
132
133
  	/* scan generation, increased every round-trip */
  	unsigned int generation;
  };
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
134
  /*
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
135
136
   * per-zone information in memory controller.
   */
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
137
  struct mem_cgroup_per_zone {
6290df545   Johannes Weiner   mm: collect LRU l...
138
  	struct lruvec		lruvec;
1eb492725   Hugh Dickins   memcg: lru_size i...
139
  	unsigned long		lru_size[NR_LRU_LISTS];
3e2f41f1f   KOSAKI Motohiro   memcg: add zone_r...
140

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
141
  	struct reclaim_iter	iter[DEF_PRIORITY + 1];
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
142

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
143
  	struct rb_node		tree_node;	/* RB tree node */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
144
  	unsigned long		usage_in_excess;/* Set to the value by which */
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
145
146
  						/* the soft limit is exceeded*/
  	bool			on_tree;
d79154bb5   Hugh Dickins   memcg: replace me...
147
  	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
4e4169535   Balbir Singh   memory controller...
148
  						/* use container_of	   */
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
149
  };
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
150
151
152
153
  
  struct mem_cgroup_per_node {
  	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
  };
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
  /*
   * Cgroups above their limits are maintained in a RB-Tree, independent of
   * their hierarchy representation
   */
  
  struct mem_cgroup_tree_per_zone {
  	struct rb_root rb_root;
  	spinlock_t lock;
  };
  
  struct mem_cgroup_tree_per_node {
  	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
  };
  
  struct mem_cgroup_tree {
  	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
  };
  
  static struct mem_cgroup_tree soft_limit_tree __read_mostly;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
173
174
  struct mem_cgroup_threshold {
  	struct eventfd_ctx *eventfd;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
175
  	unsigned long threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
176
  };
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
177
  /* For threshold */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
178
  struct mem_cgroup_threshold_ary {
748dad36d   Sha Zhengju   memcg: make thres...
179
  	/* An array index points to threshold just below or equal to usage. */
5407a5625   Phil Carmody   mm: remove unnece...
180
  	int current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
181
182
183
184
185
  	/* Size of entries[] */
  	unsigned int size;
  	/* Array of thresholds */
  	struct mem_cgroup_threshold entries[0];
  };
2c488db27   Kirill A. Shutemov   memcg: clean up m...
186
187
188
189
190
191
192
193
194
195
196
  
  struct mem_cgroup_thresholds {
  	/* Primary thresholds array */
  	struct mem_cgroup_threshold_ary *primary;
  	/*
  	 * Spare threshold array.
  	 * This is needed to make mem_cgroup_unregister_event() "never fail".
  	 * It must be able to store at least primary->size - 1 entries.
  	 */
  	struct mem_cgroup_threshold_ary *spare;
  };
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
197
198
199
200
201
  /* for OOM */
  struct mem_cgroup_eventfd_list {
  	struct list_head list;
  	struct eventfd_ctx *eventfd;
  };
2e72b6347   Kirill A. Shutemov   memcg: implement ...
202

79bd9814e   Tejun Heo   cgroup, memcg: mo...
203
204
205
  /*
   * cgroup_event represents events which userspace want to receive.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
206
  struct mem_cgroup_event {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
207
  	/*
59b6f8734   Tejun Heo   memcg: make cgrou...
208
  	 * memcg which the event belongs to.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
209
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
210
  	struct mem_cgroup *memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
211
  	/*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
212
213
214
215
216
217
218
219
  	 * eventfd to signal userspace about the event.
  	 */
  	struct eventfd_ctx *eventfd;
  	/*
  	 * Each of these stored in a list by the cgroup.
  	 */
  	struct list_head list;
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
220
221
222
223
  	 * register_event() callback will be used to add new userspace
  	 * waiter for changes related to this event.  Use eventfd_signal()
  	 * on eventfd to send notification to userspace.
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
224
  	int (*register_event)(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
225
  			      struct eventfd_ctx *eventfd, const char *args);
fba948078   Tejun Heo   cgroup, memcg: mo...
226
227
228
229
230
  	/*
  	 * unregister_event() callback will be called when userspace closes
  	 * the eventfd or on cgroup removing.  This callback must be set,
  	 * if you want provide notification functionality.
  	 */
59b6f8734   Tejun Heo   memcg: make cgrou...
231
  	void (*unregister_event)(struct mem_cgroup *memcg,
fba948078   Tejun Heo   cgroup, memcg: mo...
232
233
  				 struct eventfd_ctx *eventfd);
  	/*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
234
235
236
237
238
239
240
241
  	 * All fields below needed to unregister event when
  	 * userspace closes eventfd.
  	 */
  	poll_table pt;
  	wait_queue_head_t *wqh;
  	wait_queue_t wait;
  	struct work_struct remove;
  };
c0ff4b854   Raghavendra K T   memcg: rename mem...
242
243
  static void mem_cgroup_threshold(struct mem_cgroup *memcg);
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
244

f64c3f549   Balbir Singh   memory controller...
245
  /*
8cdea7c05   Balbir Singh   Memory controller...
246
247
248
249
   * The memory controller data structure. The memory controller controls both
   * page cache and RSS per cgroup. We would eventually like to provide
   * statistics based on the statistics developed by Rik Van Riel for clock-pro,
   * to help the administrator determine what knobs to tune.
8cdea7c05   Balbir Singh   Memory controller...
250
251
252
   */
  struct mem_cgroup {
  	struct cgroup_subsys_state css;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
253
254
255
256
257
  
  	/* Accounted resources */
  	struct page_counter memory;
  	struct page_counter memsw;
  	struct page_counter kmem;
241994ed8   Johannes Weiner   mm: memcontrol: d...
258
259
260
  	/* Normal memory consumption range */
  	unsigned long low;
  	unsigned long high;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
261
  	unsigned long soft_limit;
59927fb98   Hugh Dickins   memcg: free mem_c...
262

70ddf637e   Anton Vorontsov   memcg: add memory...
263
264
  	/* vmpressure notifications */
  	struct vmpressure vmpressure;
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
265
266
  	/* css_online() has been completed */
  	int initialized;
465939a1f   Li Zefan   memcg: don't need...
267
  	/*
18f59ea7d   Balbir Singh   memcg: memory cgr...
268
269
270
  	 * Should the accounting and control be hierarchical, per subtree?
  	 */
  	bool use_hierarchy;
79dfdaccd   Michal Hocko   memcg: make oom_l...
271

c2b42d3ca   Tejun Heo   memcg: convert me...
272
  	/* protected by memcg_oom_lock */
79dfdaccd   Michal Hocko   memcg: make oom_l...
273
  	bool		oom_lock;
c2b42d3ca   Tejun Heo   memcg: convert me...
274
  	int		under_oom;
79dfdaccd   Michal Hocko   memcg: make oom_l...
275

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
276
  	int	swappiness;
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
277
278
  	/* OOM-Killer disable */
  	int		oom_kill_disable;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
279

2e72b6347   Kirill A. Shutemov   memcg: implement ...
280
281
282
283
  	/* protect arrays of thresholds */
  	struct mutex thresholds_lock;
  
  	/* thresholds for memory usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
284
  	struct mem_cgroup_thresholds thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
285

2e72b6347   Kirill A. Shutemov   memcg: implement ...
286
  	/* thresholds for mem+swap usage. RCU-protected */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
287
  	struct mem_cgroup_thresholds memsw_thresholds;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
288

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
289
290
  	/* For oom notifier event fd */
  	struct list_head oom_notify;
185efc0f9   Johannes Weiner   memcg: Revert "me...
291

d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
292
  	/*
7dc74be03   Daisuke Nishimura   memcg: add interf...
293
294
295
  	 * Should we move charges of a task when a task is moved into this
  	 * mem_cgroup ? And what type of charges should we move ?
  	 */
f894ffa86   Andrew Morton   memcg: trivial cl...
296
  	unsigned long move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
297
  	/*
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
298
299
  	 * set > 0 if pages under this cgroup are moving to other cgroup.
  	 */
6de226191   Johannes Weiner   mm: memcontrol: t...
300
  	atomic_t		moving_account;
312734c04   KAMEZAWA Hiroyuki   memcg: remove PCG...
301
  	/* taken only while moving_account > 0 */
6de226191   Johannes Weiner   mm: memcontrol: t...
302
303
304
  	spinlock_t		move_lock;
  	struct task_struct	*move_lock_task;
  	unsigned long		move_lock_flags;
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
305
  	/*
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
306
  	 * percpu counter.
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
307
  	 */
3a7951b4c   Kirill A. Shutemov   memcg: mark stat ...
308
  	struct mem_cgroup_stat_cpu __percpu *stat;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
309
  	spinlock_t pcp_counter_lock;
d1a4c0b37   Glauber Costa   tcp memory pressu...
310

4bd2c1ee4   Michal Hocko   memcg: cleanup km...
311
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
312
  	struct cg_proto tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
313
  #endif
2633d7a02   Glauber Costa   slab/slub: consid...
314
  #if defined(CONFIG_MEMCG_KMEM)
f7ce3190c   Vladimir Davydov   slab: embed memcg...
315
          /* Index in the kmem_cache->memcg_params.memcg_caches array */
2633d7a02   Glauber Costa   slab/slub: consid...
316
  	int kmemcg_id;
2788cf0c4   Vladimir Davydov   memcg: reparent l...
317
  	bool kmem_acct_activated;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
318
  	bool kmem_acct_active;
2633d7a02   Glauber Costa   slab/slub: consid...
319
  #endif
45cf7ebd5   Glauber Costa   memcg: reduce the...
320
321
322
323
324
325
326
  
  	int last_scanned_node;
  #if MAX_NUMNODES > 1
  	nodemask_t	scan_nodes;
  	atomic_t	numainfo_events;
  	atomic_t	numainfo_updating;
  #endif
70ddf637e   Anton Vorontsov   memcg: add memory...
327

52ebea749   Tejun Heo   writeback: make b...
328
329
  #ifdef CONFIG_CGROUP_WRITEBACK
  	struct list_head cgwb_list;
841710aa6   Tejun Heo   writeback: implem...
330
  	struct wb_domain cgwb_domain;
52ebea749   Tejun Heo   writeback: make b...
331
  #endif
fba948078   Tejun Heo   cgroup, memcg: mo...
332
333
334
  	/* List of events which userspace wants to receive */
  	struct list_head event_list;
  	spinlock_t event_list_lock;
54f72fe02   Johannes Weiner   memcg: clean up m...
335
336
  	struct mem_cgroup_per_node *nodeinfo[0];
  	/* WARNING: nodeinfo must be the last member here */
8cdea7c05   Balbir Singh   Memory controller...
337
  };
510fc4e11   Glauber Costa   memcg: kmem accou...
338
  #ifdef CONFIG_MEMCG_KMEM
cb731d6c6   Vladimir Davydov   vmscan: per memor...
339
  bool memcg_kmem_is_active(struct mem_cgroup *memcg)
7de37682b   Glauber Costa   memcg: kmem accou...
340
  {
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
341
  	return memcg->kmem_acct_active;
7de37682b   Glauber Costa   memcg: kmem accou...
342
  }
510fc4e11   Glauber Costa   memcg: kmem accou...
343
  #endif
7dc74be03   Daisuke Nishimura   memcg: add interf...
344
345
  /* Stuff for move charges at task migration. */
  /*
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
346
   * Types of charges to be moved.
7dc74be03   Daisuke Nishimura   memcg: add interf...
347
   */
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
348
349
350
  #define MOVE_ANON	0x1U
  #define MOVE_FILE	0x2U
  #define MOVE_MASK	(MOVE_ANON | MOVE_FILE)
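
  /*
   * Editor's illustration, not part of the original file: a minimal sketch of
   * how the MOVE_* bits are meant to be used. move_flags_valid_example() is a
   * hypothetical helper; move_charge_at_immigrate is assumed to accept only
   * combinations of MOVE_ANON and MOVE_FILE, i.e. bits within MOVE_MASK.
   */
  static inline bool move_flags_valid_example(unsigned long val)
  {
  	/* e.g. MOVE_ANON | MOVE_FILE enables moving both charge types */
  	return !(val & ~MOVE_MASK);
  }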
7dc74be03   Daisuke Nishimura   memcg: add interf...
351

4ffef5fef   Daisuke Nishimura   memcg: move charg...
352
353
  /* "mc" and its members are protected by cgroup_mutex */
  static struct move_charge_struct {
b1dd693e5   Daisuke Nishimura   memcg: avoid dead...
354
  	spinlock_t	  lock; /* for from, to */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
355
356
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
357
  	unsigned long flags;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
358
  	unsigned long precharge;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
359
  	unsigned long moved_charge;
483c30b51   Daisuke Nishimura   memcg: improve pe...
360
  	unsigned long moved_swap;
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
361
362
363
  	struct task_struct *moving_task;	/* a task moving charges */
  	wait_queue_head_t waitq;		/* a waitq for other context */
  } mc = {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
364
  	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
8033b97c9   Daisuke Nishimura   memcg: avoid oom ...
365
366
  	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
367

4e4169535   Balbir Singh   memory controller...
368
369
370
371
  /*
   * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
   * limit reclaim to prevent infinite loops, if they ever occur.
   */
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
372
  #define	MEM_CGROUP_MAX_RECLAIM_LOOPS		100
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
373
  #define	MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	2
4e4169535   Balbir Singh   memory controller...
374

217bc3194   KAMEZAWA Hiroyuki   memory cgroup enh...
375
376
  enum charge_type {
  	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
41326c17f   Kamezawa Hiroyuki   memcg: rename MEM...
377
  	MEM_CGROUP_CHARGE_TYPE_ANON,
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
378
  	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
8a9478ca7   KAMEZAWA Hiroyuki   memcg: fix swap a...
379
  	MEM_CGROUP_CHARGE_TYPE_DROP,	/* a page was unused swap cache */
c05555b57   KAMEZAWA Hiroyuki   memcg: atomic ops...
380
381
  	NR_CHARGE_TYPE,
  };
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
382
  /* for encoding cft->private value on file */
86ae53e1a   Glauber Costa   memcg: change def...
383
384
385
386
  enum res_type {
  	_MEM,
  	_MEMSWAP,
  	_OOM_TYPE,
510fc4e11   Glauber Costa   memcg: kmem accou...
387
  	_KMEM,
86ae53e1a   Glauber Costa   memcg: change def...
388
  };
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
389
390
  #define MEMFILE_PRIVATE(x, val)	((x) << 16 | (val))
  #define MEMFILE_TYPE(val)	((val) >> 16 & 0xffff)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
391
  #define MEMFILE_ATTR(val)	((val) & 0xffff)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
392
393
  /* Used for OOM notifier */
  #define OOM_CONTROL		(0)
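
  /*
   * Editor's illustration, not part of the original file: shows how the
   * MEMFILE_* macros above round-trip the type/attribute pair packed into
   * cft->private. memfile_packing_example() is a hypothetical helper.
   */
  static inline void memfile_packing_example(void)
  {
  	int priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);

  	/* both halves are recovered unchanged */
  	WARN_ON(MEMFILE_TYPE(priv) != _OOM_TYPE);
  	WARN_ON(MEMFILE_ATTR(priv) != OOM_CONTROL);
  }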
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
394

75822b449   Balbir Singh   memory controller...
395
  /*
0999821b1   Glauber Costa   memcg: replace cg...
396
397
398
399
400
   * The memcg_create_mutex will be held whenever a new cgroup is created.
   * As a consequence, any change that needs to protect against new child cgroups
   * appearing has to hold it as well.
   */
  static DEFINE_MUTEX(memcg_create_mutex);
b21451459   Wanpeng Li   memcg: add mem_cg...
401
402
  struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
  {
a7c6d554a   Tejun Heo   cgroup: add/updat...
403
  	return s ? container_of(s, struct mem_cgroup, css) : NULL;
b21451459   Wanpeng Li   memcg: add mem_cg...
404
  }
70ddf637e   Anton Vorontsov   memcg: add memory...
405
406
407
408
409
410
411
412
413
414
415
416
  /* Some nice accessors for the vmpressure. */
  struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
  {
  	if (!memcg)
  		memcg = root_mem_cgroup;
  	return &memcg->vmpressure;
  }
  
  struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
  {
  	return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
  }
7ffc0edc4   Michal Hocko   memcg: move mem_c...
417
418
419
420
  static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
  {
  	return (memcg == root_mem_cgroup);
  }
4219b2da2   Li Zefan   memcg: fail to cr...
421
422
423
424
425
  /*
   * We restrict the id in the range of [1, 65535], so it can fit into
   * an unsigned short.
   */
  #define MEM_CGROUP_ID_MAX	USHRT_MAX
34c00c319   Li Zefan   memcg: convert to...
426
427
  static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
  {
15a4c835e   Tejun Heo   cgroup, memcg: im...
428
  	return memcg->css.id;
34c00c319   Li Zefan   memcg: convert to...
429
  }
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
430
431
432
433
434
435
  /*
   * A helper function to get a mem_cgroup from its ID. Must be called under
   * rcu_read_lock().  The caller is responsible for calling
   * css_tryget_online() if the mem_cgroup is used for charging. (Dropping a
   * refcnt from swap can happen against an already removed memcg.)
   */
34c00c319   Li Zefan   memcg: convert to...
436
437
438
  static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
  {
  	struct cgroup_subsys_state *css;
7d699ddb2   Tejun Heo   cgroup, memcg: al...
439
  	css = css_from_id(id, &memory_cgrp_subsys);
34c00c319   Li Zefan   memcg: convert to...
440
441
  	return mem_cgroup_from_css(css);
  }
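
  /*
   * Editor's illustration, not part of the original file: for a live, online
   * memcg the ID round-trips under rcu_read_lock(). The helper name below is
   * hypothetical.
   */
  static inline void mem_cgroup_id_roundtrip_example(struct mem_cgroup *memcg)
  {
  	rcu_read_lock();
  	WARN_ON(mem_cgroup_from_id(mem_cgroup_id(memcg)) != memcg);
  	rcu_read_unlock();
  }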
e1aab161e   Glauber Costa   socket: initial c...
442
  /* Writing them here to avoid exposing memcg's inner layout */
4bd2c1ee4   Michal Hocko   memcg: cleanup km...
443
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
e1aab161e   Glauber Costa   socket: initial c...
444

e1aab161e   Glauber Costa   socket: initial c...
445
446
  void sock_update_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
447
  	if (mem_cgroup_sockets_enabled) {
e1aab161e   Glauber Costa   socket: initial c...
448
  		struct mem_cgroup *memcg;
3f1346193   Glauber Costa   memcg: decrement ...
449
  		struct cg_proto *cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
450
451
  
  		BUG_ON(!sk->sk_prot->proto_cgroup);
f3f511e1c   Glauber Costa   net: fix sock_clo...
452
453
454
455
456
457
458
459
460
461
  		/* Socket cloning can throw us here with sk_cgrp already
  		 * filled. It won't, however, necessarily happen from
  		 * process context. So the test for root memcg given
  		 * the current task's memcg won't help us in this case.
  		 *
  		 * Respecting the original socket's memcg is a better
  		 * decision in this case.
  		 */
  		if (sk->sk_cgrp) {
  			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
5347e5ae1   Li Zefan   memcg: use css_ge...
462
  			css_get(&sk->sk_cgrp->memcg->css);
f3f511e1c   Glauber Costa   net: fix sock_clo...
463
464
  			return;
  		}
e1aab161e   Glauber Costa   socket: initial c...
465
466
  		rcu_read_lock();
  		memcg = mem_cgroup_from_task(current);
3f1346193   Glauber Costa   memcg: decrement ...
467
  		cg_proto = sk->sk_prot->proto_cgroup(memcg);
5347e5ae1   Li Zefan   memcg: use css_ge...
468
  		if (!mem_cgroup_is_root(memcg) &&
ec903c0c8   Tejun Heo   cgroup: rename cs...
469
470
  		    memcg_proto_active(cg_proto) &&
  		    css_tryget_online(&memcg->css)) {
3f1346193   Glauber Costa   memcg: decrement ...
471
  			sk->sk_cgrp = cg_proto;
e1aab161e   Glauber Costa   socket: initial c...
472
473
474
475
476
477
478
479
  		}
  		rcu_read_unlock();
  	}
  }
  EXPORT_SYMBOL(sock_update_memcg);
  
  void sock_release_memcg(struct sock *sk)
  {
376be5ff8   Glauber Costa   net: fix socket m...
480
  	if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
e1aab161e   Glauber Costa   socket: initial c...
481
482
483
  		struct mem_cgroup *memcg;
  		WARN_ON(!sk->sk_cgrp->memcg);
  		memcg = sk->sk_cgrp->memcg;
5347e5ae1   Li Zefan   memcg: use css_ge...
484
  		css_put(&sk->sk_cgrp->memcg->css);
e1aab161e   Glauber Costa   socket: initial c...
485
486
  	}
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
487
488
489
490
491
  
  struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
  {
  	if (!memcg || mem_cgroup_is_root(memcg))
  		return NULL;
2e685cad5   Eric W. Biederman   tcp_memcontrol: K...
492
  	return &memcg->tcp_mem;
d1a4c0b37   Glauber Costa   tcp memory pressu...
493
494
  }
  EXPORT_SYMBOL(tcp_proto_cgroup);
e1aab161e   Glauber Costa   socket: initial c...
495

3f1346193   Glauber Costa   memcg: decrement ...
496
  #endif
a8964b9b8   Glauber Costa   memcg: use static...
497
  #ifdef CONFIG_MEMCG_KMEM
55007d849   Glauber Costa   memcg: allocate m...
498
  /*
f7ce3190c   Vladimir Davydov   slab: embed memcg...
499
   * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
b86278359   Li Zefan   memcg: stop using...
500
501
502
503
504
   * The main reason for not using cgroup id for this:
   *  this works better in sparse environments, where we have a lot of memcgs,
   *  but only a few of them kmem-limited. Also, if we have, for instance, 200
   *  memcgs and none but the 200th is kmem-limited, we'd have to have a
   *  200-entry array for that.
55007d849   Glauber Costa   memcg: allocate m...
505
   *
dbcf73e26   Vladimir Davydov   memcg: rename som...
506
507
   * The current size of the caches array is stored in memcg_nr_cache_ids. It
   * will double each time we have to increase it.
55007d849   Glauber Costa   memcg: allocate m...
508
   */
dbcf73e26   Vladimir Davydov   memcg: rename som...
509
510
  static DEFINE_IDA(memcg_cache_ida);
  int memcg_nr_cache_ids;
749c54151   Glauber Costa   memcg: aggregate ...
511

05257a1a3   Vladimir Davydov   memcg: add rwsem ...
512
513
514
515
516
517
518
519
520
521
522
523
  /* Protects memcg_nr_cache_ids */
  static DECLARE_RWSEM(memcg_cache_ids_sem);
  
  void memcg_get_cache_ids(void)
  {
  	down_read(&memcg_cache_ids_sem);
  }
  
  void memcg_put_cache_ids(void)
  {
  	up_read(&memcg_cache_ids_sem);
  }
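
  /*
   * Editor's illustration, not part of the original file: the usual reader
   * pattern. While the read side of memcg_cache_ids_sem is held,
   * memcg_nr_cache_ids cannot grow, so the snapshot stays valid. The helper
   * name is hypothetical.
   */
  static inline int memcg_nr_cache_ids_snapshot_example(void)
  {
  	int nr;

  	memcg_get_cache_ids();
  	nr = memcg_nr_cache_ids;	/* safe upper bound while held */
  	memcg_put_cache_ids();

  	return nr;
  }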
55007d849   Glauber Costa   memcg: allocate m...
524
525
526
527
528
529
  /*
   * MIN_SIZE is different from 1, because we would like to avoid going through
   * the alloc/free process all the time. In a small machine, 4 kmem-limited
   * cgroups is a reasonable guess. In the future, it could be a parameter or
   * tunable, but that is not strictly necessary.
   *
b86278359   Li Zefan   memcg: stop using...
530
   * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
55007d849   Glauber Costa   memcg: allocate m...
531
532
   * this constant directly from cgroup, but it is understandable that this is
   * better kept as an internal representation in cgroup.c. In any case, the
b86278359   Li Zefan   memcg: stop using...
533
   * cgrp_id space is not getting any smaller, and we don't necessarily have to
55007d849   Glauber Costa   memcg: allocate m...
534
535
536
   * increase ours as well if it increases.
   */
  #define MEMCG_CACHES_MIN_SIZE 4
b86278359   Li Zefan   memcg: stop using...
537
  #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
55007d849   Glauber Costa   memcg: allocate m...
538

d7f25f8a2   Glauber Costa   memcg: infrastruc...
539
540
541
542
543
544
  /*
   * A lot of the calls to the cache allocation functions are expected to be
   * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
   * conditional to this static branch, we'll have to allow modules that do
   * kmem_cache_alloc and the like to see this symbol as well.
   */
a8964b9b8   Glauber Costa   memcg: use static...
545
  struct static_key memcg_kmem_enabled_key;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
546
  EXPORT_SYMBOL(memcg_kmem_enabled_key);
a8964b9b8   Glauber Costa   memcg: use static...
547

a8964b9b8   Glauber Costa   memcg: use static...
548
  #endif /* CONFIG_MEMCG_KMEM */
f64c3f549   Balbir Singh   memory controller...
549
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
550
  mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
f64c3f549   Balbir Singh   memory controller...
551
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
552
553
  	int nid = zone_to_nid(zone);
  	int zid = zone_idx(zone);
54f72fe02   Johannes Weiner   memcg: clean up m...
554
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
555
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
556
  struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
d324236b3   Wu Fengguang   memcg: add access...
557
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
558
  	return &memcg->css;
d324236b3   Wu Fengguang   memcg: add access...
559
  }
ad7fa852d   Tejun Heo   memcg: implement ...
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
  /**
   * mem_cgroup_css_from_page - css of the memcg associated with a page
   * @page: page of interest
   *
   * If memcg is bound to the default hierarchy, css of the memcg associated
   * with @page is returned.  The returned css remains associated with @page
   * until it is released.
   *
   * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
   * is returned.
   *
   * XXX: The above description of behavior on the default hierarchy isn't
   * strictly true yet as replace_page_cache_page() can modify the
   * association before @page is released even on the default hierarchy;
   * however, the current and planned usages don't mix the two functions
   * and replace_page_cache_page() will soon be updated to make the invariant
   * actually true.
   */
  struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
  {
  	struct mem_cgroup *memcg;
  
  	rcu_read_lock();
  
  	memcg = page->mem_cgroup;
  
  	if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
  		memcg = root_mem_cgroup;
  
  	rcu_read_unlock();
  	return &memcg->css;
  }
f64c3f549   Balbir Singh   memory controller...
592
  static struct mem_cgroup_per_zone *
e231875ba   Jianyu Zhan   mm: memcontrol: c...
593
  mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
f64c3f549   Balbir Singh   memory controller...
594
  {
97a6c37b3   Johannes Weiner   memcg: change pag...
595
596
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
f64c3f549   Balbir Singh   memory controller...
597

e231875ba   Jianyu Zhan   mm: memcontrol: c...
598
  	return &memcg->nodeinfo[nid]->zoneinfo[zid];
f64c3f549   Balbir Singh   memory controller...
599
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
600
601
602
603
604
605
606
607
608
609
610
611
612
613
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_node_zone(int nid, int zid)
  {
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
  
  static struct mem_cgroup_tree_per_zone *
  soft_limit_tree_from_page(struct page *page)
  {
  	int nid = page_to_nid(page);
  	int zid = page_zonenum(page);
  
  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
614
615
  static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
616
  					 unsigned long new_usage_in_excess)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
  {
  	struct rb_node **p = &mctz->rb_root.rb_node;
  	struct rb_node *parent = NULL;
  	struct mem_cgroup_per_zone *mz_node;
  
  	if (mz->on_tree)
  		return;
  
  	mz->usage_in_excess = new_usage_in_excess;
  	if (!mz->usage_in_excess)
  		return;
  	while (*p) {
  		parent = *p;
  		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
  					tree_node);
  		if (mz->usage_in_excess < mz_node->usage_in_excess)
  			p = &(*p)->rb_left;
  		/*
  		 * We can't avoid mem cgroups that are over their soft
  		 * limit by the same amount
  		 */
  		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
  			p = &(*p)->rb_right;
  	}
  	rb_link_node(&mz->tree_node, parent, p);
  	rb_insert_color(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = true;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
645
646
  static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  					 struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
647
648
649
650
651
652
  {
  	if (!mz->on_tree)
  		return;
  	rb_erase(&mz->tree_node, &mctz->rb_root);
  	mz->on_tree = false;
  }
cf2c81279   Johannes Weiner   mm: memcontrol: r...
653
654
  static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
  				       struct mem_cgroup_tree_per_zone *mctz)
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
655
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
656
657
658
  	unsigned long flags;
  
  	spin_lock_irqsave(&mctz->lock, flags);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
659
  	__mem_cgroup_remove_exceeded(mz, mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
660
  	spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
661
  }
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
662
663
664
  static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
  {
  	unsigned long nr_pages = page_counter_read(&memcg->memory);
4db0c3c29   Jason Low   mm: remove rest o...
665
  	unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
666
667
668
669
670
671
672
  	unsigned long excess = 0;
  
  	if (nr_pages > soft_limit)
  		excess = nr_pages - soft_limit;
  
  	return excess;
  }
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
673
674
675
  
  static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
676
  	unsigned long excess;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
677
678
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup_tree_per_zone *mctz;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
679

e231875ba   Jianyu Zhan   mm: memcontrol: c...
680
  	mctz = soft_limit_tree_from_page(page);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
681
682
683
684
685
  	/*
  	 * Necessary to update all ancestors when hierarchy is used,
  	 * because their event counter is not touched.
  	 */
  	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
686
  		mz = mem_cgroup_page_zoneinfo(memcg, page);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
687
  		excess = soft_limit_excess(memcg);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
688
689
690
691
692
  		/*
  		 * We have to update the tree if mz is on RB-tree or
  		 * mem is over its softlimit.
  		 */
  		if (excess || mz->on_tree) {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
693
694
695
  			unsigned long flags;
  
  			spin_lock_irqsave(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
696
697
  			/* if on-tree, remove it */
  			if (mz->on_tree)
cf2c81279   Johannes Weiner   mm: memcontrol: r...
698
  				__mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
699
700
701
702
  			/*
  			 * Insert again. mz->usage_in_excess will be updated.
  			 * If excess is 0, no tree ops.
  			 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
703
  			__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
704
  			spin_unlock_irqrestore(&mctz->lock, flags);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
705
706
707
708
709
710
  		}
  	}
  }
  
  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
  {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
711
  	struct mem_cgroup_tree_per_zone *mctz;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
712
713
  	struct mem_cgroup_per_zone *mz;
  	int nid, zid;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
714

e231875ba   Jianyu Zhan   mm: memcontrol: c...
715
716
717
718
  	for_each_node(nid) {
  		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			mctz = soft_limit_tree_node_zone(nid, zid);
cf2c81279   Johannes Weiner   mm: memcontrol: r...
719
  			mem_cgroup_remove_exceeded(mz, mctz);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
  		}
  	}
  }
  
  static struct mem_cgroup_per_zone *
  __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct rb_node *rightmost = NULL;
  	struct mem_cgroup_per_zone *mz;
  
  retry:
  	mz = NULL;
  	rightmost = rb_last(&mctz->rb_root);
  	if (!rightmost)
  		goto done;		/* Nothing to reclaim from */
  
  	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
  	/*
  	 * Remove the node now but someone else can add it back,
  	 * we will add it back at the end of reclaim to its correct
  	 * position in the tree.
  	 */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
742
  	__mem_cgroup_remove_exceeded(mz, mctz);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
743
  	if (!soft_limit_excess(mz->memcg) ||
ec903c0c8   Tejun Heo   cgroup: rename cs...
744
  	    !css_tryget_online(&mz->memcg->css))
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
745
746
747
748
749
750
751
752
753
  		goto retry;
  done:
  	return mz;
  }
  
  static struct mem_cgroup_per_zone *
  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  {
  	struct mem_cgroup_per_zone *mz;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
754
  	spin_lock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
755
  	mz = __mem_cgroup_largest_soft_limit_node(mctz);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
756
  	spin_unlock_irq(&mctz->lock);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
757
758
  	return mz;
  }
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
  /*
   * Implementation Note: reading percpu statistics for memcg.
   *
   * Both vmstat[] and percpu_counter use thresholds and periodic
   * synchronization to implement a "quick" read. There is a trade-off between
   * the cost of reading and the precision of the value, so we may eventually
   * implement periodic synchronization of the counters in memcg as well.
   *
   * But this _read() function is currently used for the user interface. Users
   * account memory usage per memory cgroup and always require an exact value,
   * because they are accounting memory. Even if we provided a quick-and-fuzzy
   * read, we would still have to visit all online cpus and compute the sum.
   * So, for now, the extra synchronization is not implemented (it is only
   * done for cpu hotplug).
   *
   * If kernel-internal users show up that can make use of a not-exact value,
   * and reading all cpu values becomes a performance bottleneck in some
   * common workload, thresholds and synchronization as in vmstat[] should be
   * implemented.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
778
  static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
7a159cc9d   Johannes Weiner   memcg: use native...
779
  				 enum mem_cgroup_stat_index idx)
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
780
  {
7a159cc9d   Johannes Weiner   memcg: use native...
781
  	long val = 0;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
782
  	int cpu;
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
783

733a572e6   Tejun Heo   memcg: make mem_c...
784
  	for_each_possible_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
785
  		val += per_cpu(memcg->stat->count[idx], cpu);
c62b1a3b3   KAMEZAWA Hiroyuki   memcg: use generi...
786
787
  	return val;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
788
  static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
e9f8974f2   Johannes Weiner   memcg: break out ...
789
790
791
792
  					    enum mem_cgroup_events_index idx)
  {
  	unsigned long val = 0;
  	int cpu;
733a572e6   Tejun Heo   memcg: make mem_c...
793
  	for_each_possible_cpu(cpu)
c0ff4b854   Raghavendra K T   memcg: rename mem...
794
  		val += per_cpu(memcg->stat->events[idx], cpu);
e9f8974f2   Johannes Weiner   memcg: break out ...
795
796
  	return val;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
797
  static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
b070e65c0   David Rientjes   mm, memcg: add rs...
798
  					 struct page *page,
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
799
  					 int nr_pages)
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
800
  {
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
801
802
803
804
  	/*
  	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
  	 * counted as CACHE even if it's on ANON LRU.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
805
  	if (PageAnon(page))
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
806
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
c0ff4b854   Raghavendra K T   memcg: rename mem...
807
  				nr_pages);
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
808
  	else
b24028572   KAMEZAWA Hiroyuki   memcg: remove PCG...
809
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
c0ff4b854   Raghavendra K T   memcg: rename mem...
810
  				nr_pages);
55e462b05   Balaji Rao   memcg: simple sta...
811

b070e65c0   David Rientjes   mm, memcg: add rs...
812
813
814
  	if (PageTransHuge(page))
  		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
  				nr_pages);
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
815
816
  	/* pagein of a big page is an event. So, ignore page size */
  	if (nr_pages > 0)
c0ff4b854   Raghavendra K T   memcg: rename mem...
817
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
818
  	else {
c0ff4b854   Raghavendra K T   memcg: rename mem...
819
  		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
3751d6043   KAMEZAWA Hiroyuki   memcg: fix event ...
820
821
  		nr_pages = -nr_pages; /* for event */
  	}
e401f1761   KAMEZAWA Hiroyuki   memcg: modify acc...
822

13114716c   Johannes Weiner   mm: memcg: keep r...
823
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
824
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
825
  unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
074291fea   Konstantin Khlebnikov   mm/vmscan: replac...
826
827
828
829
830
831
  {
  	struct mem_cgroup_per_zone *mz;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	return mz->lru_size[lru];
  }
e231875ba   Jianyu Zhan   mm: memcontrol: c...
832
833
834
  static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
  						  int nid,
  						  unsigned int lru_mask)
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
835
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
836
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
837
  	int zid;
e231875ba   Jianyu Zhan   mm: memcontrol: c...
838
  	VM_BUG_ON((unsigned)nid >= nr_node_ids);
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
839

e231875ba   Jianyu Zhan   mm: memcontrol: c...
840
841
842
843
844
845
846
847
848
849
850
851
  	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  		struct mem_cgroup_per_zone *mz;
  		enum lru_list lru;
  
  		for_each_lru(lru) {
  			if (!(BIT(lru) & lru_mask))
  				continue;
  			mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
  			nr += mz->lru_size[lru];
  		}
  	}
  	return nr;
889976dbc   Ying Han   memcg: reclaim me...
852
  }
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
853

c0ff4b854   Raghavendra K T   memcg: rename mem...
854
  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
bb2a0de92   KAMEZAWA Hiroyuki   memcg: consolidat...
855
  			unsigned int lru_mask)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
856
  {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
857
  	unsigned long nr = 0;
889976dbc   Ying Han   memcg: reclaim me...
858
  	int nid;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
859

31aaea4aa   Lai Jiangshan   memcontrol: use N...
860
  	for_each_node_state(nid, N_MEMORY)
e231875ba   Jianyu Zhan   mm: memcontrol: c...
861
862
  		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
  	return nr;
d52aa412d   KAMEZAWA Hiroyuki   memory cgroup enh...
863
  }
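
  /*
   * Editor's illustration, not part of the original file: lru_mask is a bitmap
   * of enum lru_list entries selecting which LRU lists to sum, e.g. all
   * anonymous pages of a memcg. The helper name is hypothetical.
   */
  static inline unsigned long mem_cgroup_anon_pages_example(struct mem_cgroup *memcg)
  {
  	return mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON) |
  					       BIT(LRU_ACTIVE_ANON));
  }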
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
864
865
  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  				       enum mem_cgroup_events_target target)
7a159cc9d   Johannes Weiner   memcg: use native...
866
867
  {
  	unsigned long val, next;
13114716c   Johannes Weiner   mm: memcg: keep r...
868
  	val = __this_cpu_read(memcg->stat->nr_page_events);
4799401fe   Steven Rostedt   memcg: Fix race c...
869
  	next = __this_cpu_read(memcg->stat->targets[target]);
7a159cc9d   Johannes Weiner   memcg: use native...
870
  	/* from time_after() in jiffies.h */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
871
872
873
874
875
  	if ((long)next - (long)val < 0) {
  		switch (target) {
  		case MEM_CGROUP_TARGET_THRESH:
  			next = val + THRESHOLDS_EVENTS_TARGET;
  			break;
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
876
877
878
  		case MEM_CGROUP_TARGET_SOFTLIMIT:
  			next = val + SOFTLIMIT_EVENTS_TARGET;
  			break;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
879
880
881
882
883
884
885
886
  		case MEM_CGROUP_TARGET_NUMAINFO:
  			next = val + NUMAINFO_EVENTS_TARGET;
  			break;
  		default:
  			break;
  		}
  		__this_cpu_write(memcg->stat->targets[target], next);
  		return true;
7a159cc9d   Johannes Weiner   memcg: use native...
887
  	}
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
888
  	return false;
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
889
890
891
892
893
894
  }
  
  /*
   * Check events in order.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
895
  static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
896
897
  {
  	/* threshold event is triggered in finer grain than soft limit */
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
898
899
  	if (unlikely(mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_THRESH))) {
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
900
  		bool do_softlimit;
82b3f2a71   Andrew Morton   mm/memcontrol.c: ...
901
  		bool do_numainfo __maybe_unused;
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
902

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
903
904
  		do_softlimit = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_SOFTLIMIT);
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
905
906
907
908
  #if MAX_NUMNODES > 1
  		do_numainfo = mem_cgroup_event_ratelimit(memcg,
  						MEM_CGROUP_TARGET_NUMAINFO);
  #endif
c0ff4b854   Raghavendra K T   memcg: rename mem...
909
  		mem_cgroup_threshold(memcg);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
910
911
  		if (unlikely(do_softlimit))
  			mem_cgroup_update_tree(memcg, page);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
912
  #if MAX_NUMNODES > 1
f53d7ce32   Johannes Weiner   mm: memcg: shorte...
913
  		if (unlikely(do_numainfo))
c0ff4b854   Raghavendra K T   memcg: rename mem...
914
  			atomic_inc(&memcg->numainfo_events);
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
915
  #endif
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
916
  	}
d2265e6fa   KAMEZAWA Hiroyuki   memcg : share eve...
917
  }
cf475ad28   Balbir Singh   cgroups: add an o...
918
  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
78fb74669   Pavel Emelianov   Memory controller...
919
  {
31a78f23b   Balbir Singh   mm owner: fix rac...
920
921
922
923
924
925
926
  	/*
  	 * mm_update_next_owner() may clear mm->owner to NULL
  	 * if it races with swapoff, page migration, etc.
  	 * So this can be called with p == NULL.
  	 */
  	if (unlikely(!p))
  		return NULL;
073219e99   Tejun Heo   cgroup: clean up ...
927
  	return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
78fb74669   Pavel Emelianov   Memory controller...
928
  }
df3819754   Johannes Weiner   memcg: get_mem_cg...
929
  static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
930
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
931
  	struct mem_cgroup *memcg = NULL;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
932

54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
933
934
  	rcu_read_lock();
  	do {
6f6acb005   Michal Hocko   memcg: fix swapca...
935
936
937
938
939
940
  		/*
  		 * Page cache insertions can happen without an
  		 * actual mm context, e.g. during disk probing
  		 * on boot, loopback IO, acct() writes etc.
  		 */
  		if (unlikely(!mm))
df3819754   Johannes Weiner   memcg: get_mem_cg...
941
  			memcg = root_mem_cgroup;
6f6acb005   Michal Hocko   memcg: fix swapca...
942
943
944
945
946
  		else {
  			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  			if (unlikely(!memcg))
  				memcg = root_mem_cgroup;
  		}
ec903c0c8   Tejun Heo   cgroup: rename cs...
947
  	} while (!css_tryget_online(&memcg->css));
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
948
  	rcu_read_unlock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
949
  	return memcg;
54595fe26   KAMEZAWA Hiroyuki   memcg: use css_tr...
950
  }
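
  /*
   * Editor's illustration, not part of the original file: the memcg returned
   * by get_mem_cgroup_from_mm() carries a css reference taken with
   * css_tryget_online(), so the caller must drop it when done. The helper
   * name is hypothetical.
   */
  static inline void get_mem_cgroup_from_mm_example(struct mm_struct *mm)
  {
  	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);

  	/* ... use memcg for charging or statistics ... */
  	css_put(&memcg->css);
  }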
5660048cc   Johannes Weiner   mm: move memcg hi...
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
   * @prev: previously returned memcg, NULL on first invocation
   * @reclaim: cookie for shared reclaim walks, NULL for full walks
   *
   * Returns references to children of the hierarchy below @root, or
   * @root itself, or %NULL after a full round-trip.
   *
   * Caller must pass the return value in @prev on subsequent
   * invocations for reference counting, or use mem_cgroup_iter_break()
   * to cancel a hierarchy walk before the round-trip is complete.
   *
   * Reclaimers can specify a zone and a priority level in @reclaim to
   * divide up the memcgs in the hierarchy among all concurrent
   * reclaimers operating on the same zone and priority.
   */
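  /*
   * Editor's note: an illustrative calling pattern, not part of the original
   * file. It assumes mem_cgroup_iter_break() as declared in memcontrol.h;
   * should_stop() is a hypothetical predicate.
   *
   *	struct mem_cgroup *iter = NULL;
   *
   *	while ((iter = mem_cgroup_iter(root, iter, NULL))) {
   *		if (should_stop(iter)) {
   *			mem_cgroup_iter_break(root, iter);
   *			break;
   *		}
   *	}
   */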
694fbc0fe   Andrew Morton   revert "memcg: en...
968
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
5660048cc   Johannes Weiner   mm: move memcg hi...
969
  				   struct mem_cgroup *prev,
694fbc0fe   Andrew Morton   revert "memcg: en...
970
  				   struct mem_cgroup_reclaim_cookie *reclaim)
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
971
  {
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
972
973
  	struct reclaim_iter *uninitialized_var(iter);
  	struct cgroup_subsys_state *css = NULL;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
974
  	struct mem_cgroup *memcg = NULL;
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
975
  	struct mem_cgroup *pos = NULL;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
976

694fbc0fe   Andrew Morton   revert "memcg: en...
977
978
  	if (mem_cgroup_disabled())
  		return NULL;
5660048cc   Johannes Weiner   mm: move memcg hi...
979

9f3a0d093   Johannes Weiner   mm: memcg: consol...
980
981
  	if (!root)
  		root = root_mem_cgroup;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
982

9f3a0d093   Johannes Weiner   mm: memcg: consol...
983
  	if (prev && !reclaim)
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
984
  		pos = prev;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
985

9f3a0d093   Johannes Weiner   mm: memcg: consol...
986
987
  	if (!root->use_hierarchy && root != root_mem_cgroup) {
  		if (prev)
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
988
  			goto out;
694fbc0fe   Andrew Morton   revert "memcg: en...
989
  		return root;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
990
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
991

542f85f9a   Michal Hocko   memcg: rework mem...
992
  	rcu_read_lock();
5f5781619   Michal Hocko   memcg: relax memc...
993

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
994
995
996
997
998
999
1000
1001
1002
1003
  	if (reclaim) {
  		struct mem_cgroup_per_zone *mz;
  
  		mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
  		iter = &mz->iter[reclaim->priority];
  
  		if (prev && reclaim->generation != iter->generation)
  			goto out_unlock;
  
  		do {
4db0c3c29   Jason Low   mm: remove rest o...
1004
  			pos = READ_ONCE(iter->position);
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
  			/*
  			 * A racing update may change the position and
  			 * put the last reference, hence css_tryget(),
  			 * or retry to see the updated position.
  			 */
  		} while (pos && !css_tryget(&pos->css));
  	}
  
  	if (pos)
  		css = &pos->css;
  
  	for (;;) {
  		css = css_next_descendant_pre(css, &root->css);
  		if (!css) {
  			/*
  			 * Reclaimers share the hierarchy walk, and a
  			 * new one might jump in right at the end of
  			 * the hierarchy - make sure they see at least
  			 * one group and restart from the beginning.
  			 */
  			if (!prev)
  				continue;
  			break;
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1028
  		}
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1029

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1030
1031
1032
1033
1034
1035
  		/*
  		 * Verify the css and acquire a reference.  The root
  		 * is provided by the caller, so we know it's alive
  		 * and kicking, and don't take an extra reference.
  		 */
  		memcg = mem_cgroup_from_css(css);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1036

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1037
1038
  		if (css == &root->css)
  			break;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1039

b2052564e   Johannes Weiner   mm: memcontrol: c...
1040
  		if (css_tryget(css)) {
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1041
1042
1043
1044
1045
1046
1047
  			/*
  			 * Make sure the memcg is initialized:
  			 * mem_cgroup_css_online() orders the
  			 * initialization against setting the flag.
  			 */
  			if (smp_load_acquire(&memcg->initialized))
  				break;
542f85f9a   Michal Hocko   memcg: rework mem...
1048

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1049
  			css_put(css);
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1050
  		}
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1051

5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1052
  		memcg = NULL;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1053
  	}
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
  
  	if (reclaim) {
  		if (cmpxchg(&iter->position, pos, memcg) == pos) {
  			if (memcg)
  				css_get(&memcg->css);
  			if (pos)
  				css_put(&pos->css);
  		}
  
  		/*
  		 * pairs with css_tryget when dereferencing iter->position
  		 * above.
  		 */
  		if (pos)
  			css_put(&pos->css);
  
  		if (!memcg)
  			iter->generation++;
  		else if (!prev)
  			reclaim->generation = iter->generation;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1074
  	}
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1075

542f85f9a   Michal Hocko   memcg: rework mem...
1076
1077
  out_unlock:
  	rcu_read_unlock();
5ac8fb31a   Johannes Weiner   mm: memcontrol: c...
1078
  out:
c40046f3a   Michal Hocko   memcg: keep prev'...
1079
1080
  	if (prev && prev != root)
  		css_put(&prev->css);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1081
  	return memcg;
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1082
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1083

5660048cc   Johannes Weiner   mm: move memcg hi...
1084
1085
1086
1087
1088
1089
1090
  /**
   * mem_cgroup_iter_break - abort a hierarchy walk prematurely
   * @root: hierarchy root
   * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
   */
  void mem_cgroup_iter_break(struct mem_cgroup *root,
  			   struct mem_cgroup *prev)
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1091
1092
1093
1094
1095
1096
  {
  	if (!root)
  		root = root_mem_cgroup;
  	if (prev && prev != root)
  		css_put(&prev->css);
  }
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1097

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1098
1099
1100
1101
1102
1103
  /*
   * Iteration constructs for visiting all cgroups (under a tree).  If
   * loops are exited prematurely (break), mem_cgroup_iter_break() must
   * be used for reference counting.
   */
  #define for_each_mem_cgroup_tree(iter, root)		\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1104
  	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1105
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1106
  	     iter = mem_cgroup_iter(root, iter, NULL))
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
1107

9f3a0d093   Johannes Weiner   mm: memcg: consol...
1108
  #define for_each_mem_cgroup(iter)			\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1109
  	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1110
  	     iter != NULL;				\
527a5ec9a   Johannes Weiner   mm: memcg: per-pr...
1111
  	     iter = mem_cgroup_iter(NULL, iter, NULL))
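  
  /*
   * Illustrative sketch (not part of the original file): how a caller is
   * expected to combine the iterator with mem_cgroup_iter_break() when it
   * leaves the walk early.  The want_to_stop() predicate is hypothetical.
   */
  static inline void example_walk_memcg_tree(struct mem_cgroup *root)
  {
  	struct mem_cgroup *iter;
  
  	for_each_mem_cgroup_tree(iter, root) {
  		if (want_to_stop(iter)) {	/* hypothetical predicate */
  			mem_cgroup_iter_break(root, iter);
  			break;
  		}
  	}
  }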
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
1112

68ae564bb   David Rientjes   mm, memcg: avoid ...
1113
  void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
456f998ec   Ying Han   memcg: add the pa...
1114
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1115
  	struct mem_cgroup *memcg;
456f998ec   Ying Han   memcg: add the pa...
1116

456f998ec   Ying Han   memcg: add the pa...
1117
  	rcu_read_lock();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1118
1119
  	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
  	if (unlikely(!memcg))
456f998ec   Ying Han   memcg: add the pa...
1120
1121
1122
  		goto out;
  
  	switch (idx) {
456f998ec   Ying Han   memcg: add the pa...
1123
  	case PGFAULT:
0e574a932   Johannes Weiner   mm: memcg: clean ...
1124
1125
1126
1127
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
  		break;
  	case PGMAJFAULT:
  		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
456f998ec   Ying Han   memcg: add the pa...
1128
1129
1130
1131
1132
1133
1134
  		break;
  	default:
  		BUG();
  	}
  out:
  	rcu_read_unlock();
  }
68ae564bb   David Rientjes   mm, memcg: avoid ...
1135
  EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
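  
  /*
   * Illustrative sketch (an assumption, not code from this file): callers are
   * expected to go through the mem_cgroup_count_vm_event() wrapper from
   * <linux/memcontrol.h>, e.g. when a major fault is serviced.
   */
  static inline void example_count_majfault(struct mm_struct *mm)
  {
  	mem_cgroup_count_vm_event(mm, PGMAJFAULT);
  }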
456f998ec   Ying Han   memcg: add the pa...
1136

925b7673c   Johannes Weiner   mm: make per-memc...
1137
1138
1139
  /**
   * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
   * @zone: zone of the wanted lruvec
fa9add641   Hugh Dickins   mm/memcg: apply a...
1140
   * @memcg: memcg of the wanted lruvec
925b7673c   Johannes Weiner   mm: make per-memc...
1141
1142
1143
1144
1145
1146
1147
1148
1149
   *
   * Returns the lru list vector holding pages for the given @zone and
   * @memcg.  This can be the global zone lruvec, if the memory controller
   * is disabled.
   */
  struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
  				      struct mem_cgroup *memcg)
  {
  	struct mem_cgroup_per_zone *mz;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1150
  	struct lruvec *lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1151

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1152
1153
1154
1155
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1156

e231875ba   Jianyu Zhan   mm: memcontrol: c...
1157
  	mz = mem_cgroup_zone_zoneinfo(memcg, zone);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
925b7673c   Johannes Weiner   mm: make per-memc...
1168
  }
925b7673c   Johannes Weiner   mm: make per-memc...
1169
  /**
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1170
   * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
925b7673c   Johannes Weiner   mm: make per-memc...
1171
   * @page: the page
fa9add641   Hugh Dickins   mm/memcg: apply a...
1172
   * @zone: zone of the page
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1173
1174
1175
1176
   *
   * This function is only safe when following the LRU page isolation
   * and putback protocol: the LRU lock must be held, and the page must
   * either be PageLRU() or the caller must have isolated/allocated it.
925b7673c   Johannes Weiner   mm: make per-memc...
1177
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1178
  struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1179
  {
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1180
  	struct mem_cgroup_per_zone *mz;
925b7673c   Johannes Weiner   mm: make per-memc...
1181
  	struct mem_cgroup *memcg;
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1182
  	struct lruvec *lruvec;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
1183

bea8c150a   Hugh Dickins   memcg: fix hotplu...
1184
1185
1186
1187
  	if (mem_cgroup_disabled()) {
  		lruvec = &zone->lruvec;
  		goto out;
  	}
925b7673c   Johannes Weiner   mm: make per-memc...
1188

1306a85ae   Johannes Weiner   mm: embed the mem...
1189
  	memcg = page->mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1190
  	/*
dfe0e773d   Johannes Weiner   mm: memcontrol: u...
1191
  	 * Swapcache readahead pages are added to the LRU - and
298333157   Johannes Weiner   mm: memcontrol: r...
1192
  	 * possibly migrated - before they are charged.
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1193
  	 */
298333157   Johannes Weiner   mm: memcontrol: r...
1194
1195
  	if (!memcg)
  		memcg = root_mem_cgroup;
7512102cf   Hugh Dickins   memcg: fix GPF wh...
1196

e231875ba   Jianyu Zhan   mm: memcontrol: c...
1197
  	mz = mem_cgroup_page_zoneinfo(memcg, page);
bea8c150a   Hugh Dickins   memcg: fix hotplu...
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
  	lruvec = &mz->lruvec;
  out:
  	/*
  	 * Since a node can be onlined after the mem_cgroup was created,
  	 * we have to be prepared to initialize lruvec->zone here;
  	 * and if offlined then reonlined, we need to reinitialize it.
  	 */
  	if (unlikely(lruvec->zone != zone))
  		lruvec->zone = zone;
  	return lruvec;
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1208
  }
b69408e88   Christoph Lameter   vmscan: Use an in...
1209

925b7673c   Johannes Weiner   mm: make per-memc...
1210
  /**
fa9add641   Hugh Dickins   mm/memcg: apply a...
1211
1212
1213
1214
   * mem_cgroup_update_lru_size - account for adding or removing an lru page
   * @lruvec: mem_cgroup per zone lru vector
   * @lru: index of lru list the page is sitting on
   * @nr_pages: positive when adding or negative when removing
925b7673c   Johannes Weiner   mm: make per-memc...
1215
   *
fa9add641   Hugh Dickins   mm/memcg: apply a...
1216
1217
   * This function must be called when a page is added to or removed from an
   * lru list.
3f58a8294   Minchan Kim   memcg: move memcg...
1218
   */
fa9add641   Hugh Dickins   mm/memcg: apply a...
1219
1220
  void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
  				int nr_pages)
3f58a8294   Minchan Kim   memcg: move memcg...
1221
1222
  {
  	struct mem_cgroup_per_zone *mz;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1223
  	unsigned long *lru_size;
3f58a8294   Minchan Kim   memcg: move memcg...
1224
1225
1226
  
  	if (mem_cgroup_disabled())
  		return;
fa9add641   Hugh Dickins   mm/memcg: apply a...
1227
1228
1229
1230
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	lru_size = mz->lru_size + lru;
  	*lru_size += nr_pages;
  	VM_BUG_ON((long)(*lru_size) < 0);
08e552c69   KAMEZAWA Hiroyuki   memcg: synchroniz...
1231
  }
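  
  /*
   * Illustrative sketch (an assumption): this mirrors how the LRU helpers in
   * include/linux/mm_inline.h are expected to pair the size update with the
   * actual list manipulation when a page is added to an lru list.
   */
  static __always_inline void example_add_page_to_lru_list(struct page *page,
  				struct lruvec *lruvec, enum lru_list lru)
  {
  	int nr_pages = hpage_nr_pages(page);
  
  	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
  	list_add(&page->lru, &lruvec->lists[lru]);
  	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
  }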
544122e5e   KAMEZAWA Hiroyuki   memcg: fix LRU ac...
1232

2314b42db   Johannes Weiner   mm: memcontrol: d...
1233
  bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
3e92041d6   Michal Hocko   memcg: add mem_cg...
1234
  {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1235
  	if (root == memcg)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1236
  		return true;
2314b42db   Johannes Weiner   mm: memcontrol: d...
1237
  	if (!root->use_hierarchy)
91c63734f   Johannes Weiner   kernel: cgroup: p...
1238
  		return false;
2314b42db   Johannes Weiner   mm: memcontrol: d...
1239
  	return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1240
  }
2314b42db   Johannes Weiner   mm: memcontrol: d...
1241
  bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
c3ac9a8ad   Johannes Weiner   mm: memcg: count ...
1242
  {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1243
  	struct mem_cgroup *task_memcg;
158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1244
  	struct task_struct *p;
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1245
  	bool ret;
4c4a22148   David Rientjes   memcontrol: move ...
1246

158e0a2d1   KAMEZAWA Hiroyuki   memcg: use find_l...
1247
  	p = find_lock_task_mm(task);
de077d222   David Rientjes   oom, memcg: fix e...
1248
  	if (p) {
2314b42db   Johannes Weiner   mm: memcontrol: d...
1249
  		task_memcg = get_mem_cgroup_from_mm(p->mm);
de077d222   David Rientjes   oom, memcg: fix e...
1250
1251
1252
1253
1254
1255
1256
  		task_unlock(p);
  	} else {
  		/*
  		 * All threads may have already detached their mm's, but the oom
  		 * killer still needs to detect if they have already been oom
  		 * killed to prevent needlessly killing additional tasks.
  		 */
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1257
  		rcu_read_lock();
2314b42db   Johannes Weiner   mm: memcontrol: d...
1258
1259
  		task_memcg = mem_cgroup_from_task(task);
  		css_get(&task_memcg->css);
ffbdccf5e   David Rientjes   mm, memcg: don't ...
1260
  		rcu_read_unlock();
de077d222   David Rientjes   oom, memcg: fix e...
1261
  	}
2314b42db   Johannes Weiner   mm: memcontrol: d...
1262
1263
  	ret = mem_cgroup_is_descendant(task_memcg, memcg);
  	css_put(&task_memcg->css);
4c4a22148   David Rientjes   memcontrol: move ...
1264
1265
  	return ret;
  }
c56d5c7df   Konstantin Khlebnikov   mm/vmscan: push l...
1266
  int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
14797e236   KOSAKI Motohiro   memcg: add inacti...
1267
  {
9b272977e   Johannes Weiner   memcg: skip scann...
1268
  	unsigned long inactive_ratio;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1269
  	unsigned long inactive;
9b272977e   Johannes Weiner   memcg: skip scann...
1270
  	unsigned long active;
c772be939   KOSAKI Motohiro   memcg: fix calcul...
1271
  	unsigned long gb;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1272

4d7dcca21   Hugh Dickins   mm/memcg: get_lru...
1273
1274
  	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
  	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
14797e236   KOSAKI Motohiro   memcg: add inacti...
1275

c772be939   KOSAKI Motohiro   memcg: fix calcul...
1276
1277
1278
1279
1280
  	gb = (inactive + active) >> (30 - PAGE_SHIFT);
  	if (gb)
  		inactive_ratio = int_sqrt(10 * gb);
  	else
  		inactive_ratio = 1;
9b272977e   Johannes Weiner   memcg: skip scann...
1281
  	return inactive * inactive_ratio < active;
14797e236   KOSAKI Motohiro   memcg: add inacti...
1282
  }
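  
  /*
   * Worked example (illustrative, not in the original source): with 4 GiB of
   * anonymous pages on this lruvec and PAGE_SHIFT == 12, gb = 4 and
   * inactive_ratio = int_sqrt(40) = 6, so the inactive anon list is reported
   * as low only while inactive * 6 < active, i.e. while it holds less than
   * roughly 1/7 of the anonymous pages.
   */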
90cbc2508   Vladimir Davydov   vmscan: force sca...
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
  bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
  {
  	struct mem_cgroup_per_zone *mz;
  	struct mem_cgroup *memcg;
  
  	if (mem_cgroup_disabled())
  		return true;
  
  	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
  	memcg = mz->memcg;
  
  	return !!(memcg->css.flags & CSS_ONLINE);
  }
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1296
  #define mem_cgroup_from_counter(counter, member)	\
6d61ef409   Balbir Singh   memcg: memory cgr...
1297
  	container_of(counter, struct mem_cgroup, member)
19942822d   Johannes Weiner   memcg: prevent en...
1298
  /**
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1299
   * mem_cgroup_margin - calculate chargeable space of a memory cgroup
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1300
   * @memcg: the memory cgroup
19942822d   Johannes Weiner   memcg: prevent en...
1301
   *
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1302
   * Returns the maximum amount of memory @memcg can be charged with, in
7ec99d621   Johannes Weiner   memcg: unify char...
1303
   * pages.
19942822d   Johannes Weiner   memcg: prevent en...
1304
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1305
  static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
19942822d   Johannes Weiner   memcg: prevent en...
1306
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1307
1308
1309
  	unsigned long margin = 0;
  	unsigned long count;
  	unsigned long limit;
9d11ea9f1   Johannes Weiner   memcg: simplify t...
1310

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1311
  	count = page_counter_read(&memcg->memory);
4db0c3c29   Jason Low   mm: remove rest o...
1312
  	limit = READ_ONCE(memcg->memory.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1313
1314
1315
1316
1317
  	if (count < limit)
  		margin = limit - count;
  
  	if (do_swap_account) {
  		count = page_counter_read(&memcg->memsw);
4db0c3c29   Jason Low   mm: remove rest o...
1318
  		limit = READ_ONCE(memcg->memsw.limit);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1319
1320
1321
1322
1323
  		if (count <= limit)
  			margin = min(margin, limit - count);
  	}
  
  	return margin;
19942822d   Johannes Weiner   memcg: prevent en...
1324
  }
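  
  /*
   * Worked example (illustrative): with memory.limit = 512 MiB at 448 MiB of
   * usage and memsw.limit = 768 MiB at 720 MiB of usage, the memory margin is
   * 64 MiB but the mem+swap margin is only 48 MiB, so mem_cgroup_margin()
   * reports 48 MiB worth of pages as chargeable.
   */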
1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
1325
  int mem_cgroup_swappiness(struct mem_cgroup *memcg)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1326
  {
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1327
  	/* root ? */
14208b0ec   Linus Torvalds   Merge branch 'for...
1328
  	if (mem_cgroup_disabled() || !memcg->css.parent)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1329
  		return vm_swappiness;
bf1ff2635   Johannes Weiner   memcg: remove mem...
1330
  	return memcg->swappiness;
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
1331
  }
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
1332
  /*
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1333
   * A routine for checking "mem" is under move_account() or not.
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1334
   *
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1335
1336
1337
   * Check whether a cgroup is mc.from or mc.to or under the hierarchy of
   * a moving cgroup. This is used for waiting at high memory pressure
   * caused by "move".
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1338
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1339
  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1340
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1341
1342
  	struct mem_cgroup *from;
  	struct mem_cgroup *to;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1343
  	bool ret = false;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1344
1345
1346
1347
1348
1349
1350
1351
1352
  	/*
  	 * Unlike the task_move routines, we access mc.to and mc.from without
  	 * mutual exclusion by cgroup_mutex. Here, we take the spinlock instead.
  	 */
  	spin_lock(&mc.lock);
  	from = mc.from;
  	to = mc.to;
  	if (!from)
  		goto unlock;
3e92041d6   Michal Hocko   memcg: add mem_cg...
1353

2314b42db   Johannes Weiner   mm: memcontrol: d...
1354
1355
  	ret = mem_cgroup_is_descendant(from, memcg) ||
  		mem_cgroup_is_descendant(to, memcg);
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
1356
1357
  unlock:
  	spin_unlock(&mc.lock);
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1358
1359
  	return ret;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1360
  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1361
1362
  {
  	if (mc.moving_task && current != mc.moving_task) {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1363
  		if (mem_cgroup_under_move(memcg)) {
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
  			DEFINE_WAIT(wait);
  			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
  			/* moving charge context might have finished. */
  			if (mc.moving_task)
  				schedule();
  			finish_wait(&mc.waitq, &wait);
  			return true;
  		}
  	}
  	return false;
  }
58cf188ed   Sha Zhengju   memcg, oom: provi...
1375
  #define K(x) ((x) << (PAGE_SHIFT-10))
e222432bf   Balbir Singh   memcg: show memcg...
1376
  /**
58cf188ed   Sha Zhengju   memcg, oom: provi...
1377
   * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
e222432bf   Balbir Singh   memcg: show memcg...
1378
1379
1380
1381
1382
1383
1384
1385
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
   * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
   * enabled
   */
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
  {
e61734c55   Tejun Heo   cgroup: remove cg...
1386
  	/* oom_info_lock ensures that parallel ooms do not interleave */
08088cb9a   Michal Hocko   memcg: change oom...
1387
  	static DEFINE_MUTEX(oom_info_lock);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1388
1389
  	struct mem_cgroup *iter;
  	unsigned int i;
e222432bf   Balbir Singh   memcg: show memcg...
1390

08088cb9a   Michal Hocko   memcg: change oom...
1391
  	mutex_lock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1392
  	rcu_read_lock();
2415b9f5c   Balasubramani Vivekanandan   memcg: print cgro...
1393
1394
1395
1396
1397
1398
1399
  	if (p) {
  		pr_info("Task in ");
  		pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
  		pr_cont(" killed as a result of limit of ");
  	} else {
  		pr_info("Memory limit reached of cgroup ");
  	}
e61734c55   Tejun Heo   cgroup: remove cg...
1400
  	pr_cont_cgroup_path(memcg->css.cgroup);
0346dadbf   Greg Thelen   memcg: remove ext...
1401
1402
  	pr_cont("
  ");
e222432bf   Balbir Singh   memcg: show memcg...
1403

e222432bf   Balbir Singh   memcg: show memcg...
1404
  	rcu_read_unlock();
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
  	pr_info("memory: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memory)),
  		K((u64)memcg->memory.limit), memcg->memory.failcnt);
  	pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->memsw)),
  		K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
  	pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu
  ",
  		K((u64)page_counter_read(&memcg->kmem)),
  		K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1417
1418
  
  	for_each_mem_cgroup_tree(iter, memcg) {
e61734c55   Tejun Heo   cgroup: remove cg...
1419
1420
  		pr_info("Memory cgroup stats for ");
  		pr_cont_cgroup_path(iter->css.cgroup);
58cf188ed   Sha Zhengju   memcg, oom: provi...
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
  		pr_cont(":");
  
  		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
  				continue;
  			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
  				K(mem_cgroup_read_stat(iter, i)));
  		}
  
  		for (i = 0; i < NR_LRU_LISTS; i++)
  			pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
  				K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
  
  		pr_cont("
  ");
  	}
08088cb9a   Michal Hocko   memcg: change oom...
1437
  	mutex_unlock(&oom_info_lock);
e222432bf   Balbir Singh   memcg: show memcg...
1438
  }
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1439
1440
1441
1442
  /*
   * This function returns the number of memcgs under the hierarchy tree.
   * Returns 1 (self count) if there are no children.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1443
  static int mem_cgroup_count_children(struct mem_cgroup *memcg)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1444
1445
  {
  	int num = 0;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1446
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1447
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1448
  		num++;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
1449
1450
  	return num;
  }
6d61ef409   Balbir Singh   memcg: memory cgr...
1451
  /*
a63d83f42   David Rientjes   oom: badness heur...
1452
1453
   * Return the memory (and swap, if configured) limit for a memcg.
   */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1454
  static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
a63d83f42   David Rientjes   oom: badness heur...
1455
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1456
  	unsigned long limit;
f3e8eb70b   Johannes Weiner   memcg: fix unit m...
1457

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1458
  	limit = memcg->memory.limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1459
  	if (mem_cgroup_swappiness(memcg)) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1460
  		unsigned long memsw_limit;
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1461

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1462
1463
  		memsw_limit = memcg->memsw.limit;
  		limit = min(limit + total_swap_pages, memsw_limit);
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1464
  	}
9a5a8f19b   Michal Hocko   memcg: oom: fix t...
1465
  	return limit;
a63d83f42   David Rientjes   oom: badness heur...
1466
  }
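  
  /*
   * Worked example (illustrative): with memory.limit = 1 GiB, memsw.limit =
   * 1.5 GiB, 4 GiB of total swap and a non-zero swappiness, the result is
   * min(1 GiB + 4 GiB, 1.5 GiB) = 1.5 GiB, i.e. the mem+swap limit caps how
   * far the available swap can inflate the OOM badness base.
   */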
19965460e   David Rientjes   mm, memcg: make m...
1467
1468
  static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
  				     int order)
9cbb78bb3   David Rientjes   mm, memcg: introd...
1469
1470
1471
1472
1473
1474
  {
  	struct mem_cgroup *iter;
  	unsigned long chosen_points = 0;
  	unsigned long totalpages;
  	unsigned int points = 0;
  	struct task_struct *chosen = NULL;
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1475
  	mutex_lock(&oom_lock);
876aafbfd   David Rientjes   mm, memcg: move a...
1476
  	/*
465adcf1e   David Rientjes   mm, memcg: give e...
1477
1478
1479
  	 * If current has a pending SIGKILL or is exiting, then automatically
  	 * select it.  The goal is to allow it to allocate so that it may
  	 * quickly exit and free its memory.
876aafbfd   David Rientjes   mm, memcg: move a...
1480
  	 */
d003f371b   Oleg Nesterov   oom: don't assume...
1481
  	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
16e951966   Johannes Weiner   mm: oom_kill: cle...
1482
  		mark_oom_victim(current);
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1483
  		goto unlock;
876aafbfd   David Rientjes   mm, memcg: move a...
1484
  	}
2415b9f5c   Balasubramani Vivekanandan   memcg: print cgro...
1485
  	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1486
  	totalpages = mem_cgroup_get_limit(memcg) ? : 1;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1487
  	for_each_mem_cgroup_tree(iter, memcg) {
72ec70299   Tejun Heo   cgroup: make task...
1488
  		struct css_task_iter it;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1489
  		struct task_struct *task;
72ec70299   Tejun Heo   cgroup: make task...
1490
1491
  		css_task_iter_start(&iter->css, &it);
  		while ((task = css_task_iter_next(&it))) {
9cbb78bb3   David Rientjes   mm, memcg: introd...
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
  			switch (oom_scan_process_thread(task, totalpages, NULL,
  							false)) {
  			case OOM_SCAN_SELECT:
  				if (chosen)
  					put_task_struct(chosen);
  				chosen = task;
  				chosen_points = ULONG_MAX;
  				get_task_struct(chosen);
  				/* fall through */
  			case OOM_SCAN_CONTINUE:
  				continue;
  			case OOM_SCAN_ABORT:
72ec70299   Tejun Heo   cgroup: make task...
1504
  				css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1505
1506
1507
  				mem_cgroup_iter_break(memcg, iter);
  				if (chosen)
  					put_task_struct(chosen);
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1508
  				goto unlock;
9cbb78bb3   David Rientjes   mm, memcg: introd...
1509
1510
1511
1512
  			case OOM_SCAN_OK:
  				break;
  			};
  			points = oom_badness(task, memcg, NULL, totalpages);
d49ad9355   David Rientjes   mm, oom: prefer t...
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
  			if (!points || points < chosen_points)
  				continue;
  			/* Prefer thread group leaders for display purposes */
  			if (points == chosen_points &&
  			    thread_group_leader(chosen))
  				continue;
  
  			if (chosen)
  				put_task_struct(chosen);
  			chosen = task;
  			chosen_points = points;
  			get_task_struct(chosen);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1525
  		}
72ec70299   Tejun Heo   cgroup: make task...
1526
  		css_task_iter_end(&it);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1527
  	}
dc56401fc   Johannes Weiner   mm: oom_kill: sim...
1528
1529
1530
1531
1532
1533
1534
  	if (chosen) {
  		points = chosen_points * 1000 / totalpages;
  		oom_kill_process(chosen, gfp_mask, order, points, totalpages,
  				 memcg, NULL, "Memory cgroup out of memory");
  	}
  unlock:
  	mutex_unlock(&oom_lock);
9cbb78bb3   David Rientjes   mm, memcg: introd...
1535
  }
ae6e71d3d   Michele Curti   mm/memcontrol.c: ...
1536
  #if MAX_NUMNODES > 1
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1537
1538
  /**
   * test_mem_cgroup_node_reclaimable
dad7557eb   Wanpeng Li   mm: fix kernel-do...
1539
   * @memcg: the target memcg
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1540
1541
1542
1543
1544
1545
1546
   * @nid: the node ID to be checked.
   * @noswap: specify true here if the user wants file only information.
   *
   * This function returns whether the specified memcg contains any
   * reclaimable pages on a node. Returns true if there are any reclaimable
   * pages in the node.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1547
  static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1548
1549
  		int nid, bool noswap)
  {
c0ff4b854   Raghavendra K T   memcg: rename mem...
1550
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1551
1552
1553
  		return true;
  	if (noswap || !total_swap_pages)
  		return false;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1554
  	if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
4d0c066d2   KAMEZAWA Hiroyuki   memcg: fix reclai...
1555
1556
1557
1558
  		return true;
  	return false;
  
  }
889976dbc   Ying Han   memcg: reclaim me...
1559
1560
1561
1562
1563
1564
1565
  
  /*
   * Always updating the nodemask is not very good - even if we have an empty
   * list or the wrong list here, we can start from some node and traverse all
   * nodes based on the zonelist. So update the list loosely once per 10 secs.
   *
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1566
  static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1567
1568
  {
  	int nid;
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1569
1570
1571
1572
  	/*
  	 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
  	 * pagein/pageout changes since the last update.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1573
  	if (!atomic_read(&memcg->numainfo_events))
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1574
  		return;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1575
  	if (atomic_inc_return(&memcg->numainfo_updating) > 1)
889976dbc   Ying Han   memcg: reclaim me...
1576
  		return;
889976dbc   Ying Han   memcg: reclaim me...
1577
  	/* make a nodemask where this memcg uses memory from */
31aaea4aa   Lai Jiangshan   memcontrol: use N...
1578
  	memcg->scan_nodes = node_states[N_MEMORY];
889976dbc   Ying Han   memcg: reclaim me...
1579

31aaea4aa   Lai Jiangshan   memcontrol: use N...
1580
  	for_each_node_mask(nid, node_states[N_MEMORY]) {
889976dbc   Ying Han   memcg: reclaim me...
1581

c0ff4b854   Raghavendra K T   memcg: rename mem...
1582
1583
  		if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
  			node_clear(nid, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1584
  	}
453a9bf34   KAMEZAWA Hiroyuki   memcg: fix numa s...
1585

c0ff4b854   Raghavendra K T   memcg: rename mem...
1586
1587
  	atomic_set(&memcg->numainfo_events, 0);
  	atomic_set(&memcg->numainfo_updating, 0);
889976dbc   Ying Han   memcg: reclaim me...
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
  }
  
  /*
   * Selecting a node where we start reclaim from. Because what we need is just
   * reducing the usage counter, starting from anywhere is OK. Considering
   * memory reclaim from the current node, there are pros and cons.
   *
   * Freeing memory from current node means freeing memory from a node which
   * we'll use or we've used. So, it may make LRU bad. And if several threads
   * hit limits, it will see a contention on a node. But freeing from remote
   * node means more costs for memory reclaim because of memory latency.
   *
   * Now, we use round-robin. Better algorithm is welcomed.
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
1602
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1603
1604
  {
  	int node;
c0ff4b854   Raghavendra K T   memcg: rename mem...
1605
1606
  	mem_cgroup_may_update_nodemask(memcg);
  	node = memcg->last_scanned_node;
889976dbc   Ying Han   memcg: reclaim me...
1607

c0ff4b854   Raghavendra K T   memcg: rename mem...
1608
  	node = next_node(node, memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1609
  	if (node == MAX_NUMNODES)
c0ff4b854   Raghavendra K T   memcg: rename mem...
1610
  		node = first_node(memcg->scan_nodes);
889976dbc   Ying Han   memcg: reclaim me...
1611
1612
1613
1614
1615
1616
1617
1618
  	/*
  	 * We call this when we hit limit, not when pages are added to LRU.
  	 * No LRU may hold pages because all pages are UNEVICTABLE or
  	 * memcg is too small and all pages are not on the LRU. In that case,
  	 * we use the current node.
  	 */
  	if (unlikely(node == MAX_NUMNODES))
  		node = numa_node_id();
c0ff4b854   Raghavendra K T   memcg: rename mem...
1619
  	memcg->last_scanned_node = node;
889976dbc   Ying Han   memcg: reclaim me...
1620
1621
  	return node;
  }
889976dbc   Ying Han   memcg: reclaim me...
1622
  #else
c0ff4b854   Raghavendra K T   memcg: rename mem...
1623
  int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
889976dbc   Ying Han   memcg: reclaim me...
1624
1625
1626
1627
  {
  	return 0;
  }
  #endif
0608f43da   Andrew Morton   revert "memcg, vm...
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
  static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
  				   struct zone *zone,
  				   gfp_t gfp_mask,
  				   unsigned long *total_scanned)
  {
  	struct mem_cgroup *victim = NULL;
  	int total = 0;
  	int loop = 0;
  	unsigned long excess;
  	unsigned long nr_scanned;
  	struct mem_cgroup_reclaim_cookie reclaim = {
  		.zone = zone,
  		.priority = 0,
  	};
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1642
  	excess = soft_limit_excess(root_memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
  
  	while (1) {
  		victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
  		if (!victim) {
  			loop++;
  			if (loop >= 2) {
  				/*
  				 * If we have not been able to reclaim
  				 * anything, it might because there are
  				 * no reclaimable pages under this hierarchy
  				 */
  				if (!total)
  					break;
  				/*
  				 * We want to do more targeted reclaim.
  				 * excess >> 2 is not too excessive, so we do
  				 * not reclaim too much, nor so little that we
  				 * keep coming back to reclaim from this cgroup
  				 */
  				if (total >= (excess >> 2) ||
  					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
  					break;
  			}
  			continue;
  		}
0608f43da   Andrew Morton   revert "memcg, vm...
1668
1669
1670
  		total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
  						     zone, &nr_scanned);
  		*total_scanned += nr_scanned;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
1671
  		if (!soft_limit_excess(root_memcg))
0608f43da   Andrew Morton   revert "memcg, vm...
1672
  			break;
6d61ef409   Balbir Singh   memcg: memory cgr...
1673
  	}
0608f43da   Andrew Morton   revert "memcg, vm...
1674
1675
  	mem_cgroup_iter_break(root_memcg, victim);
  	return total;
6d61ef409   Balbir Singh   memcg: memory cgr...
1676
  }
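  
  /*
   * Worked example (illustrative): if the hierarchy exceeds its soft limit by
   * 400 pages, the walk above ends once the excess is gone, or, after a full
   * round, once at least excess >> 2 = 100 pages were reclaimed, nothing was
   * reclaimed at all, or MEM_CGROUP_MAX_RECLAIM_LOOPS rounds have passed.
   */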
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1677
1678
1679
1680
1681
  #ifdef CONFIG_LOCKDEP
  static struct lockdep_map memcg_oom_lock_dep_map = {
  	.name = "memcg_oom_lock",
  };
  #endif
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1682
  static DEFINE_SPINLOCK(memcg_oom_lock);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1683
1684
1685
1686
  /*
   * Check OOM-Killer is already running under our hierarchy.
   * If someone is running, return false.
   */
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1687
  static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1688
  {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1689
  	struct mem_cgroup *iter, *failed = NULL;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1690

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1691
  	spin_lock(&memcg_oom_lock);
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1692
  	for_each_mem_cgroup_tree(iter, memcg) {
23751be00   Johannes Weiner   memcg: fix hierar...
1693
  		if (iter->oom_lock) {
79dfdaccd   Michal Hocko   memcg: make oom_l...
1694
1695
1696
1697
  			/*
  			 * this subtree of our hierarchy is already locked
  			 * so we cannot take the lock.
  			 */
79dfdaccd   Michal Hocko   memcg: make oom_l...
1698
  			failed = iter;
9f3a0d093   Johannes Weiner   mm: memcg: consol...
1699
1700
  			mem_cgroup_iter_break(memcg, iter);
  			break;
23751be00   Johannes Weiner   memcg: fix hierar...
1701
1702
  		} else
  			iter->oom_lock = true;
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1703
  	}
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1704

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
  	if (failed) {
  		/*
  		 * OK, we failed to lock the whole subtree so we have
  		 * to clean up what we set up, up to the failing subtree
  		 */
  		for_each_mem_cgroup_tree(iter, memcg) {
  			if (iter == failed) {
  				mem_cgroup_iter_break(memcg, iter);
  				break;
  			}
  			iter->oom_lock = false;
79dfdaccd   Michal Hocko   memcg: make oom_l...
1716
  		}
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1717
1718
  	} else
  		mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1719
1720
1721
1722
  
  	spin_unlock(&memcg_oom_lock);
  
  	return !failed;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
1723
  }
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1724

fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1725
  static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1726
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
1727
  	struct mem_cgroup *iter;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1728
  	spin_lock(&memcg_oom_lock);
0056f4e66   Johannes Weiner   mm: memcg: lockde...
1729
  	mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1730
  	for_each_mem_cgroup_tree(iter, memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1731
  		iter->oom_lock = false;
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1732
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1733
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1734
  static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1735
1736
  {
  	struct mem_cgroup *iter;
c2b42d3ca   Tejun Heo   memcg: convert me...
1737
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1738
  	for_each_mem_cgroup_tree(iter, memcg)
c2b42d3ca   Tejun Heo   memcg: convert me...
1739
1740
  		iter->under_oom++;
  	spin_unlock(&memcg_oom_lock);
79dfdaccd   Michal Hocko   memcg: make oom_l...
1741
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1742
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
79dfdaccd   Michal Hocko   memcg: make oom_l...
1743
1744
  {
  	struct mem_cgroup *iter;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1745
1746
  	/*
  	 * When a new child is created while the hierarchy is under oom,
c2b42d3ca   Tejun Heo   memcg: convert me...
1747
  	 * mem_cgroup_oom_lock() may not be called. Watch for underflow.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1748
  	 */
c2b42d3ca   Tejun Heo   memcg: convert me...
1749
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
1750
  	for_each_mem_cgroup_tree(iter, memcg)
c2b42d3ca   Tejun Heo   memcg: convert me...
1751
1752
1753
  		if (iter->under_oom > 0)
  			iter->under_oom--;
  	spin_unlock(&memcg_oom_lock);
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1754
  }
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1755
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1756
  struct oom_wait_info {
d79154bb5   Hugh Dickins   memcg: replace me...
1757
  	struct mem_cgroup *memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1758
1759
1760
1761
1762
1763
  	wait_queue_t	wait;
  };
  
  static int memcg_oom_wake_function(wait_queue_t *wait,
  	unsigned mode, int sync, void *arg)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
1764
1765
  	struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
  	struct mem_cgroup *oom_wait_memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1766
1767
1768
  	struct oom_wait_info *oom_wait_info;
  
  	oom_wait_info = container_of(wait, struct oom_wait_info, wait);
d79154bb5   Hugh Dickins   memcg: replace me...
1769
  	oom_wait_memcg = oom_wait_info->memcg;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1770

2314b42db   Johannes Weiner   mm: memcontrol: d...
1771
1772
  	if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
  	    !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1773
  		return 0;
dc98df5a1   KAMEZAWA Hiroyuki   memcg: oom wakeup...
1774
1775
  	return autoremove_wake_function(wait, mode, sync, arg);
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
1776
  static void memcg_oom_recover(struct mem_cgroup *memcg)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1777
  {
c2b42d3ca   Tejun Heo   memcg: convert me...
1778
1779
1780
1781
1782
1783
1784
1785
1786
  	/*
  	 * For the following lockless ->under_oom test, the only required
  	 * guarantee is that it must see the state asserted by an OOM when
  	 * this function is called as a result of userland actions
  	 * triggered by the notification of the OOM.  This is trivially
  	 * achieved by invoking mem_cgroup_mark_under_oom() before
  	 * triggering notification.
  	 */
  	if (memcg && memcg->under_oom)
f4b90b70b   Tejun Heo   memcg: remove unu...
1787
  		__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
1788
  }
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1789
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1790
  {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1791
1792
  	if (!current->memcg_oom.may_oom)
  		return;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1793
  	/*
494264208   Johannes Weiner   mm: memcg: handle...
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
  	 * We are in the middle of the charge context here, so we
  	 * don't want to block when potentially sitting on a callstack
  	 * that holds all kinds of filesystem and mm locks.
  	 *
  	 * Also, the caller may handle a failed allocation gracefully
  	 * (like optional page cache readahead) and so an OOM killer
  	 * invocation might not even be necessary.
  	 *
  	 * That's why we don't do anything here except remember the
  	 * OOM context and then deal with it at the end of the page
  	 * fault when the stack is unwound, the locks are released,
  	 * and when we know whether the fault was overall successful.
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1806
  	 */
494264208   Johannes Weiner   mm: memcg: handle...
1807
1808
1809
1810
  	css_get(&memcg->css);
  	current->memcg_oom.memcg = memcg;
  	current->memcg_oom.gfp_mask = mask;
  	current->memcg_oom.order = order;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1811
1812
1813
1814
  }
  
  /**
   * mem_cgroup_oom_synchronize - complete memcg OOM handling
494264208   Johannes Weiner   mm: memcg: handle...
1815
   * @handle: actually kill/wait or just clean up the OOM state
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1816
   *
494264208   Johannes Weiner   mm: memcg: handle...
1817
1818
   * This has to be called at the end of a page fault if the memcg OOM
   * handler was enabled.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1819
   *
494264208   Johannes Weiner   mm: memcg: handle...
1820
   * Memcg supports userspace OOM handling where failed allocations must
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1821
1822
1823
1824
   * sleep on a waitqueue until the userspace task resolves the
   * situation.  Sleeping directly in the charge context with all kinds
   * of locks held is not a good idea, instead we remember an OOM state
   * in the task and mem_cgroup_oom_synchronize() has to be called at
494264208   Johannes Weiner   mm: memcg: handle...
1825
   * the end of the page fault to complete the OOM handling.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1826
1827
   *
   * Returns %true if an ongoing memcg OOM situation was detected and
494264208   Johannes Weiner   mm: memcg: handle...
1828
   * completed, %false otherwise.
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1829
   */
494264208   Johannes Weiner   mm: memcg: handle...
1830
  bool mem_cgroup_oom_synchronize(bool handle)
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1831
  {
494264208   Johannes Weiner   mm: memcg: handle...
1832
  	struct mem_cgroup *memcg = current->memcg_oom.memcg;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1833
  	struct oom_wait_info owait;
494264208   Johannes Weiner   mm: memcg: handle...
1834
  	bool locked;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1835
1836
  
  	/* OOM is global, do not handle */
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1837
  	if (!memcg)
494264208   Johannes Weiner   mm: memcg: handle...
1838
  		return false;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1839

c32b3cbe0   Michal Hocko   oom, PM: make OOM...
1840
  	if (!handle || oom_killer_disabled)
494264208   Johannes Weiner   mm: memcg: handle...
1841
  		goto cleanup;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1842
1843
1844
1845
1846
1847
  
  	owait.memcg = memcg;
  	owait.wait.flags = 0;
  	owait.wait.func = memcg_oom_wake_function;
  	owait.wait.private = current;
  	INIT_LIST_HEAD(&owait.wait.task_list);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1848

3812c8c8f   Johannes Weiner   mm: memcg: do not...
1849
  	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
494264208   Johannes Weiner   mm: memcg: handle...
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
  	mem_cgroup_mark_under_oom(memcg);
  
  	locked = mem_cgroup_oom_trylock(memcg);
  
  	if (locked)
  		mem_cgroup_oom_notify(memcg);
  
  	if (locked && !memcg->oom_kill_disable) {
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
  					 current->memcg_oom.order);
  	} else {
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1863
  		schedule();
494264208   Johannes Weiner   mm: memcg: handle...
1864
1865
1866
1867
1868
  		mem_cgroup_unmark_under_oom(memcg);
  		finish_wait(&memcg_oom_waitq, &owait.wait);
  	}
  
  	if (locked) {
fb2a6fc56   Johannes Weiner   mm: memcg: rework...
1869
1870
1871
1872
1873
1874
1875
1876
  		mem_cgroup_oom_unlock(memcg);
  		/*
  		 * There is no guarantee that an OOM-lock contender
  		 * sees the wakeups triggered by the OOM kill
  		 * uncharges.  Wake any sleepers explicitly.
  		 */
  		memcg_oom_recover(memcg);
  	}
494264208   Johannes Weiner   mm: memcg: handle...
1877
1878
  cleanup:
  	current->memcg_oom.memcg = NULL;
3812c8c8f   Johannes Weiner   mm: memcg: do not...
1879
  	css_put(&memcg->css);
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
1880
  	return true;
0b7f569e4   KAMEZAWA Hiroyuki   memcg: fix OOM ki...
1881
  }
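  
  /*
   * Illustrative sketch (an assumption, simplified from the expected callers
   * in the page fault exit path): a userspace fault that entered a memcg OOM
   * state either cleans it up quietly when the allocation was handled
   * gracefully, or leaves the handle == true call to the global OOM path
   * once the fault has fully unwound.  The wrapper name is hypothetical.
   */
  static inline void example_fault_exit(unsigned int flags, int ret)
  {
  	if ((flags & FAULT_FLAG_USER) && task_in_memcg_oom(current) &&
  	    !(ret & VM_FAULT_OOM))
  		mem_cgroup_oom_synchronize(false);	/* just clean up */
  }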
d7365e783   Johannes Weiner   mm: memcontrol: f...
1882
1883
1884
  /**
   * mem_cgroup_begin_page_stat - begin a page state statistics transaction
   * @page: page that is going to change accounted state
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1885
   *
d7365e783   Johannes Weiner   mm: memcontrol: f...
1886
1887
1888
   * This function must mark the beginning of an accounted page state
   * change to prevent double accounting when the page is concurrently
   * being moved to another memcg:
32047e2a8   KAMEZAWA Hiroyuki   memcg: avoid lock...
1889
   *
6de226191   Johannes Weiner   mm: memcontrol: t...
1890
   *   memcg = mem_cgroup_begin_page_stat(page);
d7365e783   Johannes Weiner   mm: memcontrol: f...
1891
1892
   *   if (TestClearPageState(page))
   *     mem_cgroup_update_page_stat(memcg, state, -1);
6de226191   Johannes Weiner   mm: memcontrol: t...
1893
   *   mem_cgroup_end_page_stat(memcg);
d69b042f3   Balbir Singh   memcg: add file-b...
1894
   */
6de226191   Johannes Weiner   mm: memcontrol: t...
1895
  struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1896
1897
  {
  	struct mem_cgroup *memcg;
6de226191   Johannes Weiner   mm: memcontrol: t...
1898
  	unsigned long flags;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1899

6de226191   Johannes Weiner   mm: memcontrol: t...
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
  	/*
  	 * The RCU lock is held throughout the transaction.  The fast
  	 * path can get away without acquiring the memcg->move_lock
  	 * because page moving starts with an RCU grace period.
  	 *
  	 * The RCU lock also protects the memcg from being freed when
  	 * the page state that is going to change is the only thing
  	 * preventing the page from being uncharged.
  	 * E.g. end-writeback clearing PageWriteback(), which allows
  	 * migration to go ahead and uncharge the page before the
  	 * account transaction might be complete.
  	 */
d7365e783   Johannes Weiner   mm: memcontrol: f...
1912
1913
1914
1915
  	rcu_read_lock();
  
  	if (mem_cgroup_disabled())
  		return NULL;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1916
  again:
1306a85ae   Johannes Weiner   mm: embed the mem...
1917
  	memcg = page->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
1918
  	if (unlikely(!memcg))
d7365e783   Johannes Weiner   mm: memcontrol: f...
1919
  		return NULL;
bdcbb659f   Qiang Huang   memcg: fold mem_c...
1920
  	if (atomic_read(&memcg->moving_account) <= 0)
d7365e783   Johannes Weiner   mm: memcontrol: f...
1921
  		return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1922

6de226191   Johannes Weiner   mm: memcontrol: t...
1923
  	spin_lock_irqsave(&memcg->move_lock, flags);
1306a85ae   Johannes Weiner   mm: embed the mem...
1924
  	if (memcg != page->mem_cgroup) {
6de226191   Johannes Weiner   mm: memcontrol: t...
1925
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1926
1927
  		goto again;
  	}
6de226191   Johannes Weiner   mm: memcontrol: t...
1928
1929
1930
1931
1932
1933
1934
1935
  
  	/*
  	 * When charge migration first begins, we can have locked and
  	 * unlocked page stat updates happening concurrently.  Track
  	 * the task that has the lock for mem_cgroup_end_page_stat().
  	 */
  	memcg->move_lock_task = current;
  	memcg->move_lock_flags = flags;
d7365e783   Johannes Weiner   mm: memcontrol: f...
1936
1937
  
  	return memcg;
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1938
  }
c4843a759   Greg Thelen   memcg: add per cg...
1939
  EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1940

d7365e783   Johannes Weiner   mm: memcontrol: f...
1941
1942
1943
  /**
   * mem_cgroup_end_page_stat - finish a page state statistics transaction
   * @memcg: the memcg that was accounted against
d7365e783   Johannes Weiner   mm: memcontrol: f...
1944
   */
6de226191   Johannes Weiner   mm: memcontrol: t...
1945
  void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1946
  {
6de226191   Johannes Weiner   mm: memcontrol: t...
1947
1948
1949
1950
1951
1952
1953
1954
  	if (memcg && memcg->move_lock_task == current) {
  		unsigned long flags = memcg->move_lock_flags;
  
  		memcg->move_lock_task = NULL;
  		memcg->move_lock_flags = 0;
  
  		spin_unlock_irqrestore(&memcg->move_lock, flags);
  	}
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1955

d7365e783   Johannes Weiner   mm: memcontrol: f...
1956
  	rcu_read_unlock();
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1957
  }
c4843a759   Greg Thelen   memcg: add per cg...
1958
  EXPORT_SYMBOL(mem_cgroup_end_page_stat);
89c06bd52   KAMEZAWA Hiroyuki   memcg: use new lo...
1959

d7365e783   Johannes Weiner   mm: memcontrol: f...
1960
1961
1962
1963
1964
1965
1966
1967
1968
  /**
   * mem_cgroup_update_page_stat - update page state statistics
   * @memcg: memcg to account against
   * @idx: page state item to account
   * @val: number of pages (positive or negative)
   *
   * See mem_cgroup_begin_page_stat() for locking requirements.
   */
  void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
68b4876d9   Sha Zhengju   memcg: remove MEM...
1969
  				 enum mem_cgroup_stat_index idx, int val)
d69b042f3   Balbir Singh   memcg: add file-b...
1970
  {
658b72c5a   Sha Zhengju   memcg: check for ...
1971
  	VM_BUG_ON(!rcu_read_lock_held());
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1972

d7365e783   Johannes Weiner   mm: memcontrol: f...
1973
1974
  	if (memcg)
  		this_cpu_add(memcg->stat->count[idx], val);
d69b042f3   Balbir Singh   memcg: add file-b...
1975
  }
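  
  /*
   * Illustrative sketch (an assumption, not code from this file): the
   * transaction pattern documented at mem_cgroup_begin_page_stat(), applied
   * to per-cgroup dirty page accounting.
   */
  static void example_clear_page_dirty_stat(struct page *page)
  {
  	struct mem_cgroup *memcg;
  
  	memcg = mem_cgroup_begin_page_stat(page);
  	if (TestClearPageDirty(page))
  		mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_DIRTY, -1);
  	mem_cgroup_end_page_stat(memcg);
  }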
26174efd4   KAMEZAWA Hiroyuki   memcg: generic fi...
1976

f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
1977
  /*
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1978
1979
1980
   * Size of the first charge trial. "32" comes from vmscan.c's magic value.
   * TODO: larger batches may be necessary on very large machines.
   */
7ec99d621   Johannes Weiner   memcg: unify char...
1981
  #define CHARGE_BATCH	32U
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1982
1983
  struct memcg_stock_pcp {
  	struct mem_cgroup *cached; /* this is never the root cgroup */
11c9ea4e8   Johannes Weiner   memcg: convert pe...
1984
  	unsigned int nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1985
  	struct work_struct work;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
1986
  	unsigned long flags;
a0db00fcf   Kirill A. Shutemov   memcg: remove red...
1987
  #define FLUSHING_CACHED_CHARGE	0
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1988
1989
  };
  static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
9f50fad65   Michal Hocko   Revert "memcg: ge...
1990
  static DEFINE_MUTEX(percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
1991

a0956d544   Suleiman Souhlal   memcg: make it po...
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
  /**
   * consume_stock: Try to consume stocked charge on this cpu.
   * @memcg: memcg to consume from.
   * @nr_pages: how many pages to charge.
   *
   * The charges will only happen if @memcg matches the current cpu's memcg
   * stock, and at least @nr_pages are available in that stock.  Failure to
   * service an allocation will refill the stock.
   *
   * returns true if successful, false otherwise.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2002
   */
a0956d544   Suleiman Souhlal   memcg: make it po...
2003
  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2004
2005
  {
  	struct memcg_stock_pcp *stock;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2006
  	bool ret = false;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2007

a0956d544   Suleiman Souhlal   memcg: make it po...
2008
  	if (nr_pages > CHARGE_BATCH)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2009
  		return ret;
a0956d544   Suleiman Souhlal   memcg: make it po...
2010

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2011
  	stock = &get_cpu_var(memcg_stock);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2012
  	if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
a0956d544   Suleiman Souhlal   memcg: make it po...
2013
  		stock->nr_pages -= nr_pages;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2014
2015
  		ret = true;
  	}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2016
2017
2018
2019
2020
  	put_cpu_var(memcg_stock);
  	return ret;
  }
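  /*
   * Note on the stock mechanism: consume_stock() only succeeds when the local
   * CPU's stock already holds enough charges for the same memcg.  Otherwise
   * try_charge() charges the page counters directly and, via refill_stock(),
   * parks whatever is left of its CHARGE_BATCH in the stock so that the next
   * charge on this CPU can be served without touching the shared counters.
   */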
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2021
   * Return the charges cached in the per-CPU stock and reset the cached information.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2022
2023
2024
2025
   */
  static void drain_stock(struct memcg_stock_pcp *stock)
  {
  	struct mem_cgroup *old = stock->cached;
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2026
  	if (stock->nr_pages) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2027
  		page_counter_uncharge(&old->memory, stock->nr_pages);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2028
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2029
  			page_counter_uncharge(&old->memsw, stock->nr_pages);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2030
  		css_put_many(&old->css, stock->nr_pages);
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2031
  		stock->nr_pages = 0;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2032
2033
  	}
  	stock->cached = NULL;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2034
2035
2036
2037
2038
2039
2040
2041
  }
  
  /*
   * This must be called with preemption disabled, or by a thread that is
   * pinned to the local CPU.
   */
  static void drain_local_stock(struct work_struct *dummy)
  {
7c8e0181e   Christoph Lameter   mm: replace __get...
2042
  	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2043
  	drain_stock(stock);
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2044
  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2045
2046
2047
  }
  
  /*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2048
   * Cache charges (nr_pages) in the local per-CPU stock.
320cc51d9   Greg Thelen   mm: fix typo in r...
2049
   * They will be consumed by consume_stock() later.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2050
   */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2051
  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2052
2053
  {
  	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2054
  	if (stock->cached != memcg) { /* reset if necessary */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2055
  		drain_stock(stock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2056
  		stock->cached = memcg;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2057
  	}
11c9ea4e8   Johannes Weiner   memcg: convert pe...
2058
  	stock->nr_pages += nr_pages;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2059
2060
2061
2062
  	put_cpu_var(memcg_stock);
  }
  
  /*
c0ff4b854   Raghavendra K T   memcg: rename mem...
2063
   * Drain all per-CPU charge caches for the given root_memcg and the subtree
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2064
   * of the hierarchy under it.
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2065
   */
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2066
  static void drain_all_stock(struct mem_cgroup *root_memcg)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2067
  {
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2068
  	int cpu, curcpu;
d38144b7a   Michal Hocko   memcg: unify sync...
2069

6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2070
2071
2072
  	/* If someone's already draining, avoid starting more workers. */
  	if (!mutex_trylock(&percpu_charge_mutex))
  		return;
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2073
  	/* Notify other cpus that system-wide "drain" is running */
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2074
  	get_online_cpus();
5af12d0ef   Johannes Weiner   memcg: pin execut...
2075
  	curcpu = get_cpu();
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2076
2077
  	for_each_online_cpu(cpu) {
  		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
c0ff4b854   Raghavendra K T   memcg: rename mem...
2078
  		struct mem_cgroup *memcg;
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2079

c0ff4b854   Raghavendra K T   memcg: rename mem...
2080
2081
  		memcg = stock->cached;
  		if (!memcg || !stock->nr_pages)
26fe61684   KAMEZAWA Hiroyuki   memcg: fix percpu...
2082
  			continue;
2314b42db   Johannes Weiner   mm: memcontrol: d...
2083
  		if (!mem_cgroup_is_descendant(memcg, root_memcg))
3e92041d6   Michal Hocko   memcg: add mem_cg...
2084
  			continue;
d1a05b697   Michal Hocko   memcg: do not try...
2085
2086
2087
2088
2089
2090
  		if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
  			if (cpu == curcpu)
  				drain_local_stock(&stock->work);
  			else
  				schedule_work_on(cpu, &stock->work);
  		}
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2091
  	}
5af12d0ef   Johannes Weiner   memcg: pin execut...
2092
  	put_cpu();
f894ffa86   Andrew Morton   memcg: trivial cl...
2093
  	put_online_cpus();
9f50fad65   Michal Hocko   Revert "memcg: ge...
2094
  	mutex_unlock(&percpu_charge_mutex);
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2095
  }
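  /*
   * Note that drain_all_stock() drains the current CPU's stock synchronously
   * and every other CPU's stock from a scheduled work item, with the
   * FLUSHING_CACHED_CHARGE bit preventing duplicate work per stock.
   */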
0db0628d9   Paul Gortmaker   kernel: delete __...
2096
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2097
2098
2099
2100
2101
  					unsigned long action,
  					void *hcpu)
  {
  	int cpu = (unsigned long)hcpu;
  	struct memcg_stock_pcp *stock;
619d094b5   KAMEZAWA Hiroyuki   memcg: simplify m...
2102
  	if (action == CPU_ONLINE)
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2103
  		return NOTIFY_OK;
1489ebad8   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2104

d833049bd   Kirill A. Shutemov   memcg: fix broken...
2105
  	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2106
  		return NOTIFY_OK;
711d3d2c9   KAMEZAWA Hiroyuki   memcg: cpu hotplu...
2107

cdec2e426   KAMEZAWA Hiroyuki   memcg: coalesce c...
2108
2109
2110
2111
  	stock = &per_cpu(memcg_stock, cpu);
  	drain_stock(stock);
  	return NOTIFY_OK;
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2112
2113
  static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
  		      unsigned int nr_pages)
8a9f3ccd2   Balbir Singh   Memory controller...
2114
  {
7ec99d621   Johannes Weiner   memcg: unify char...
2115
  	unsigned int batch = max(CHARGE_BATCH, nr_pages);
9b1306192   Johannes Weiner   mm: memcontrol: r...
2116
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2117
  	struct mem_cgroup *mem_over_limit;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2118
  	struct page_counter *counter;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2119
  	unsigned long nr_reclaimed;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2120
2121
  	bool may_swap = true;
  	bool drained = false;
05b843012   Johannes Weiner   mm: memcontrol: u...
2122
  	int ret = 0;
a636b327f   KAMEZAWA Hiroyuki   memcg: avoid unne...
2123

ce00a9673   Johannes Weiner   mm: memcontrol: r...
2124
2125
  	if (mem_cgroup_is_root(memcg))
  		goto done;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2126
  retry:
b6b6cc72b   Michal Hocko   memcg: do not rep...
2127
2128
  	if (consume_stock(memcg, nr_pages))
  		goto done;
8a9f3ccd2   Balbir Singh   Memory controller...
2129

3fbe72442   Johannes Weiner   mm: memcontrol: s...
2130
  	if (!do_swap_account ||
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2131
2132
  	    !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
  		if (!page_counter_try_charge(&memcg->memory, batch, &counter))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2133
  			goto done_restock;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2134
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2135
2136
  			page_counter_uncharge(&memcg->memsw, batch);
  		mem_over_limit = mem_cgroup_from_counter(counter, memory);
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2137
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2138
  		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2139
  		may_swap = false;
3fbe72442   Johannes Weiner   mm: memcontrol: s...
2140
  	}
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2141

6539cc053   Johannes Weiner   mm: memcontrol: f...
2142
2143
2144
2145
  	if (batch > nr_pages) {
  		batch = nr_pages;
  		goto retry;
  	}
6d61ef409   Balbir Singh   memcg: memory cgr...
2146

06b078fc0   Johannes Weiner   mm: memcontrol: r...
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
  	/*
  	 * Unlike in global OOM situations, memcg is not in a physical
  	 * memory shortage.  Allow dying and OOM-killed tasks to
  	 * bypass the last charges so that they can exit quickly and
  	 * free their memory.
  	 */
  	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
  		     fatal_signal_pending(current) ||
  		     current->flags & PF_EXITING))
  		goto bypass;
  
  	if (unlikely(task_in_memcg_oom(current)))
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2160
2161
  	if (!(gfp_mask & __GFP_WAIT))
  		goto nomem;
4b5343346   KAMEZAWA Hiroyuki   memcg: clean up t...
2162

241994ed8   Johannes Weiner   mm: memcontrol: d...
2163
  	mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2164
2165
  	nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
  						    gfp_mask, may_swap);
6539cc053   Johannes Weiner   mm: memcontrol: f...
2166

61e02c745   Johannes Weiner   mm: memcontrol: c...
2167
  	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
6539cc053   Johannes Weiner   mm: memcontrol: f...
2168
  		goto retry;
28c34c291   Johannes Weiner   mm: memcontrol: r...
2169

b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2170
  	if (!drained) {
6d3d6aa22   Johannes Weiner   mm: memcontrol: r...
2171
  		drain_all_stock(mem_over_limit);
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2172
2173
2174
  		drained = true;
  		goto retry;
  	}
28c34c291   Johannes Weiner   mm: memcontrol: r...
2175
2176
  	if (gfp_mask & __GFP_NORETRY)
  		goto nomem;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2177
2178
2179
2180
2181
2182
2183
2184
2185
  	/*
  	 * Even though the limit is exceeded at this point, reclaim
  	 * may have been able to free some pages.  Retry the charge
  	 * before killing the task.
  	 *
  	 * Only for regular pages, though: huge pages are rather
  	 * unlikely to succeed so close to the limit, and we fall back
  	 * to regular pages anyway in case of failure.
  	 */
61e02c745   Johannes Weiner   mm: memcontrol: c...
2186
  	if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
6539cc053   Johannes Weiner   mm: memcontrol: f...
2187
2188
2189
2190
2191
2192
2193
  		goto retry;
  	/*
  	 * During a task move, charges can be counted twice, so it's better
  	 * to wait until the move has finished if one is in progress.
  	 */
  	if (mem_cgroup_wait_acct_move(mem_over_limit))
  		goto retry;
9b1306192   Johannes Weiner   mm: memcontrol: r...
2194
2195
  	if (nr_retries--)
  		goto retry;
06b078fc0   Johannes Weiner   mm: memcontrol: r...
2196
2197
  	if (gfp_mask & __GFP_NOFAIL)
  		goto bypass;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2198
2199
  	if (fatal_signal_pending(current))
  		goto bypass;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2200
  	mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
61e02c745   Johannes Weiner   mm: memcontrol: c...
2201
  	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2202
  nomem:
6d1fdc489   Johannes Weiner   memcg: sanitize _...
2203
  	if (!(gfp_mask & __GFP_NOFAIL))
3168ecbe1   Johannes Weiner   mm: memcg: use pr...
2204
  		return -ENOMEM;
867578cbc   KAMEZAWA Hiroyuki   memcg: fix oom ki...
2205
  bypass:
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2206
  	return -EINTR;
6539cc053   Johannes Weiner   mm: memcontrol: f...
2207
2208
  
  done_restock:
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2209
  	css_get_many(&memcg->css, batch);
6539cc053   Johannes Weiner   mm: memcontrol: f...
2210
2211
  	if (batch > nr_pages)
  		refill_stock(memcg, batch - nr_pages);
7d638093d   Vladimir Davydov   memcg: do not cal...
2212
2213
  	if (!(gfp_mask & __GFP_WAIT))
  		goto done;
241994ed8   Johannes Weiner   mm: memcontrol: d...
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
  	/*
  	 * If the hierarchy is above the normal consumption range,
  	 * make the charging task trim its excess contribution.
  	 */
  	do {
  		if (page_counter_read(&memcg->memory) <= memcg->high)
  			continue;
  		mem_cgroup_events(memcg, MEMCG_HIGH, 1);
  		try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
  	} while ((memcg = parent_mem_cgroup(memcg)));
6539cc053   Johannes Weiner   mm: memcontrol: f...
2224
  done:
05b843012   Johannes Weiner   mm: memcontrol: u...
2225
  	return ret;
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2226
  }
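  /*
   * Return convention of try_charge(), as relied upon by callers in this file:
   * 0 on success (possibly after reclaim), -ENOMEM when the charge failed and
   * the allocation may be failed, and -EINTR when the charge was bypassed to
   * the root cgroup (e.g. for a dying or OOM-killed task, or __GFP_NOFAIL).
   */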
8a9f3ccd2   Balbir Singh   Memory controller...
2227

00501b531   Johannes Weiner   mm: memcontrol: r...
2228
  static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
a3032a2c1   Daisuke Nishimura   memcg: add mem_cg...
2229
  {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2230
2231
  	if (mem_cgroup_is_root(memcg))
  		return;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2232
  	page_counter_uncharge(&memcg->memory, nr_pages);
05b843012   Johannes Weiner   mm: memcontrol: u...
2233
  	if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2234
  		page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2235

e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2236
  	css_put_many(&memcg->css, nr_pages);
d01dd17f1   KAMEZAWA Hiroyuki   memcg: use res_co...
2237
2238
2239
  }
  
  /*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2240
2241
2242
2243
2244
2245
2246
2247
2248
   * try_get_mem_cgroup_from_page - look up page's memcg association
   * @page: the page
   *
   * Look up, get a css reference, and return the memcg that owns @page.
   *
   * The page must be locked to prevent racing with swap-in and page
   * cache charges.  If coming from an unlocked page table, the caller
   * must ensure the page is on the LRU or this can race with charging.
   */
e42d9d5d4   Wu Fengguang   memcg: rename and...
2249
  struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2250
  {
298333157   Johannes Weiner   mm: memcontrol: r...
2251
  	struct mem_cgroup *memcg;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2252
  	unsigned short id;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2253
  	swp_entry_t ent;
309381fea   Sasha Levin   mm: dump page whe...
2254
  	VM_BUG_ON_PAGE(!PageLocked(page), page);
3c776e646   Daisuke Nishimura   memcg: charge swa...
2255

1306a85ae   Johannes Weiner   mm: embed the mem...
2256
  	memcg = page->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
2257
2258
  	if (memcg) {
  		if (!css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2259
  			memcg = NULL;
e42d9d5d4   Wu Fengguang   memcg: rename and...
2260
  	} else if (PageSwapCache(page)) {
3c776e646   Daisuke Nishimura   memcg: charge swa...
2261
  		ent.val = page_private(page);
9fb4b7cc0   Bob Liu   page_cgroup: add ...
2262
  		id = lookup_swap_cgroup_id(ent);
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2263
  		rcu_read_lock();
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
2264
  		memcg = mem_cgroup_from_id(id);
ec903c0c8   Tejun Heo   cgroup: rename cs...
2265
  		if (memcg && !css_tryget_online(&memcg->css))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2266
  			memcg = NULL;
a3b2d6926   KAMEZAWA Hiroyuki   cgroups: use css ...
2267
  		rcu_read_unlock();
3c776e646   Daisuke Nishimura   memcg: charge swa...
2268
  	}
c0ff4b854   Raghavendra K T   memcg: rename mem...
2269
  	return memcg;
b5a84319a   KAMEZAWA Hiroyuki   memcg: fix shmem'...
2270
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
  static void lock_page_lru(struct page *page, int *isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	spin_lock_irq(&zone->lru_lock);
  	if (PageLRU(page)) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		ClearPageLRU(page);
  		del_page_from_lru_list(page, lruvec, page_lru(page));
  		*isolated = 1;
  	} else
  		*isolated = 0;
  }
  
  static void unlock_page_lru(struct page *page, int isolated)
  {
  	struct zone *zone = page_zone(page);
  
  	if (isolated) {
  		struct lruvec *lruvec;
  
  		lruvec = mem_cgroup_page_lruvec(page, zone);
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		SetPageLRU(page);
  		add_page_to_lru_list(page, lruvec, page_lru(page));
  	}
  	spin_unlock_irq(&zone->lru_lock);
  }
00501b531   Johannes Weiner   mm: memcontrol: r...
2301
  static void commit_charge(struct page *page, struct mem_cgroup *memcg,
6abb5a867   Johannes Weiner   mm: memcontrol: a...
2302
  			  bool lrucare)
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2303
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2304
  	int isolated;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2305

1306a85ae   Johannes Weiner   mm: embed the mem...
2306
  	VM_BUG_ON_PAGE(page->mem_cgroup, page);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2307
2308
2309
2310
2311
  
  	/*
  	 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
  	 * may already be on some other mem_cgroup's LRU.  Take care of it.
  	 */
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2312
2313
  	if (lrucare)
  		lock_page_lru(page, &isolated);
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2314

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2315
2316
  	/*
  	 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
2317
  	 * page->mem_cgroup at this point:
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
  	 *
  	 * - the page is uncharged
  	 *
  	 * - the page is off-LRU
  	 *
  	 * - an anonymous fault has exclusive page access, except for
  	 *   a locked page table
  	 *
  	 * - a page cache insertion, a swapin fault, or a migration
  	 *   have the page locked
  	 */
1306a85ae   Johannes Weiner   mm: embed the mem...
2329
  	page->mem_cgroup = memcg;
9ce70c024   Hugh Dickins   memcg: fix deadlo...
2330

0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2331
2332
  	if (lrucare)
  		unlock_page_lru(page, isolated);
7a81b88cb   KAMEZAWA Hiroyuki   memcg: introduce ...
2333
  }
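  /*
   * commit_charge() is the second half of the charging protocol: the caller
   * first reserves the charge with try_charge() and, once the page is known,
   * binds it to the memcg here by setting page->mem_cgroup (re-linking the
   * page on its LRU when @lrucare is set).  A rough sketch, with error
   * handling left out:
   *
   *	ret = try_charge(memcg, gfp_mask, nr_pages);
   *	...
   *	commit_charge(page, memcg, lrucare);
   */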
66e1707bc   Balbir Singh   Memory controller...
2334

7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2335
  #ifdef CONFIG_MEMCG_KMEM
dbf22eb6d   Vladimir Davydov   memcg: zap __memc...
2336
2337
  int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
  		      unsigned long nr_pages)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2338
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2339
  	struct page_counter *counter;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2340
  	int ret = 0;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2341

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2342
2343
  	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
  	if (ret < 0)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2344
  		return ret;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2345
  	ret = try_charge(memcg, gfp, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2346
2347
  	if (ret == -EINTR)  {
  		/*
00501b531   Johannes Weiner   mm: memcontrol: r...
2348
2349
2350
2351
2352
2353
  		 * try_charge() chose to bypass to root due to OOM kill or
  		 * fatal signal.  Since our only options are to either fail
  		 * the allocation or charge it to this cgroup, do it as a
  		 * temporary condition. But we can't fail. From a kmem/slab
  		 * perspective, the cache has already been selected, by
  		 * mem_cgroup_kmem_get_cache(), so it is too late to change
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2354
2355
2356
  		 * our minds.
  		 *
  		 * This condition will only trigger if the task entered
00501b531   Johannes Weiner   mm: memcontrol: r...
2357
2358
2359
  		 * memcg_charge_kmem in a sane state, but was OOM-killed
  		 * during try_charge() above. Tasks that were already dying
  		 * when the allocation triggers should have been already
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2360
2361
  		 * directed to the root cgroup in memcontrol.h
  		 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2362
  		page_counter_charge(&memcg->memory, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2363
  		if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2364
  			page_counter_charge(&memcg->memsw, nr_pages);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2365
  		css_get_many(&memcg->css, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2366
2367
  		ret = 0;
  	} else if (ret)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2368
  		page_counter_uncharge(&memcg->kmem, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2369
2370
2371
  
  	return ret;
  }
dbf22eb6d   Vladimir Davydov   memcg: zap __memc...
2372
  void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2373
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2374
  	page_counter_uncharge(&memcg->memory, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2375
  	if (do_swap_account)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2376
  		page_counter_uncharge(&memcg->memsw, nr_pages);
7de37682b   Glauber Costa   memcg: kmem accou...
2377

64f219938   Johannes Weiner   mm: memcontrol: r...
2378
  	page_counter_uncharge(&memcg->kmem, nr_pages);
7de37682b   Glauber Costa   memcg: kmem accou...
2379

e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
2380
  	css_put_many(&memcg->css, nr_pages);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2381
  }
2633d7a02   Glauber Costa   slab/slub: consid...
2382
2383
2384
2385
2386
2387
2388
2389
2390
  /*
   * Helper for accessing a memcg's index. It will be used as an index in the
   * child cache array in kmem_cache, and also to derive its name. This function
   * returns -1 when this is not a kmem-limited memcg.
   */
  int memcg_cache_id(struct mem_cgroup *memcg)
  {
  	return memcg ? memcg->kmemcg_id : -1;
  }
f3bb3043a   Vladimir Davydov   memcg: don't call...
2391
  static int memcg_alloc_cache_id(void)
55007d849   Glauber Costa   memcg: allocate m...
2392
  {
f3bb3043a   Vladimir Davydov   memcg: don't call...
2393
2394
  	int id, size;
  	int err;
dbcf73e26   Vladimir Davydov   memcg: rename som...
2395
  	id = ida_simple_get(&memcg_cache_ida,
f3bb3043a   Vladimir Davydov   memcg: don't call...
2396
2397
2398
  			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
  	if (id < 0)
  		return id;
55007d849   Glauber Costa   memcg: allocate m...
2399

dbcf73e26   Vladimir Davydov   memcg: rename som...
2400
  	if (id < memcg_nr_cache_ids)
f3bb3043a   Vladimir Davydov   memcg: don't call...
2401
2402
2403
2404
2405
2406
  		return id;
  
  	/*
  	 * There's no space for the new id in memcg_caches arrays,
  	 * so we have to grow them.
  	 */
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2407
  	down_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2408
2409
  
  	size = 2 * (id + 1);
55007d849   Glauber Costa   memcg: allocate m...
2410
2411
2412
2413
  	if (size < MEMCG_CACHES_MIN_SIZE)
  		size = MEMCG_CACHES_MIN_SIZE;
  	else if (size > MEMCG_CACHES_MAX_SIZE)
  		size = MEMCG_CACHES_MAX_SIZE;
f3bb3043a   Vladimir Davydov   memcg: don't call...
2414
  	err = memcg_update_all_caches(size);
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2415
  	if (!err)
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2416
2417
  		err = memcg_update_all_list_lrus(size);
  	if (!err)
05257a1a3   Vladimir Davydov   memcg: add rwsem ...
2418
2419
2420
  		memcg_nr_cache_ids = size;
  
  	up_write(&memcg_cache_ids_sem);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2421
  	if (err) {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2422
  		ida_simple_remove(&memcg_cache_ida, id);
f3bb3043a   Vladimir Davydov   memcg: don't call...
2423
2424
2425
2426
2427
2428
2429
  		return err;
  	}
  	return id;
  }
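  /*
   * Growth example for memcg_alloc_cache_id(): the memcg_caches arrays are
   * grown to 2 * (id + 1) entries, clamped to the
   * [MEMCG_CACHES_MIN_SIZE, MEMCG_CACHES_MAX_SIZE] range, so a new id of 3
   * that does not fit yet would grow them to 8 entries (assuming 8 lies
   * within that range).
   */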
  
  static void memcg_free_cache_id(int id)
  {
dbcf73e26   Vladimir Davydov   memcg: rename som...
2430
  	ida_simple_remove(&memcg_cache_ida, id);
55007d849   Glauber Costa   memcg: allocate m...
2431
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2432
  struct memcg_kmem_cache_create_work {
5722d094a   Vladimir Davydov   memcg, slab: clea...
2433
2434
2435
2436
  	struct mem_cgroup *memcg;
  	struct kmem_cache *cachep;
  	struct work_struct work;
  };
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2437
  static void memcg_kmem_cache_create_func(struct work_struct *w)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2438
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2439
2440
  	struct memcg_kmem_cache_create_work *cw =
  		container_of(w, struct memcg_kmem_cache_create_work, work);
5722d094a   Vladimir Davydov   memcg, slab: clea...
2441
2442
  	struct mem_cgroup *memcg = cw->memcg;
  	struct kmem_cache *cachep = cw->cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2443

d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2444
  	memcg_create_kmem_cache(memcg, cachep);
bd6731458   Vladimir Davydov   memcg, slab: simp...
2445

5722d094a   Vladimir Davydov   memcg, slab: clea...
2446
  	css_put(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2447
2448
2449
2450
2451
  	kfree(cw);
  }
  
  /*
   * Enqueue the creation of a per-memcg kmem_cache.
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2452
   */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2453
2454
  static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					       struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2455
  {
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2456
  	struct memcg_kmem_cache_create_work *cw;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2457

776ed0f03   Vladimir Davydov   memcg: cleanup km...
2458
  	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2459
  	if (!cw)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2460
  		return;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2461
2462
  
  	css_get(&memcg->css);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2463
2464
2465
  
  	cw->memcg = memcg;
  	cw->cachep = cachep;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2466
  	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2467

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2468
2469
  	schedule_work(&cw->work);
  }
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2470
2471
  static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  					     struct kmem_cache *cachep)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2472
2473
2474
2475
  {
  	/*
  	 * We need to stop accounting when we kmalloc, because if the
  	 * corresponding kmalloc cache is not yet created, the first allocation
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2476
  	 * in __memcg_schedule_kmem_cache_create will recurse.
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2477
2478
2479
2480
2481
2482
2483
  	 *
  	 * However, it is better to enclose the whole function. Depending on
  	 * the debugging options enabled, INIT_WORK(), for instance, can
  	 * trigger an allocation. This, too, will make us recurse. Because at
  	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
  	 * the safest choice is to wrap the whole function like this.
  	 */
6f185c290   Vladimir Davydov   memcg: turn memcg...
2484
  	current->memcg_kmem_skip_account = 1;
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2485
  	__memcg_schedule_kmem_cache_create(memcg, cachep);
6f185c290   Vladimir Davydov   memcg: turn memcg...
2486
  	current->memcg_kmem_skip_account = 0;
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2487
  }
c67a8a685   Vladimir Davydov   memcg, slab: merg...
2488

d7f25f8a2   Glauber Costa   memcg: infrastruc...
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
  /*
   * Return the kmem_cache we're supposed to use for a slab allocation.
   * We try to use the current memcg's version of the cache.
   *
   * If the cache does not exist yet and we are its first user, we either
   * create it immediately, if possible, or create it asynchronously in a
   * workqueue.
   * In the latter case, we will let the current allocation go through with
   * the original cache.
   *
   * Can't be called in interrupt context or from kernel threads.
   * This function needs to be called with rcu_read_lock() held.
   */
056b7ccef   Zhang Zhen   mm/memcontrol.c: ...
2502
  struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2503
2504
  {
  	struct mem_cgroup *memcg;
959c8963f   Vladimir Davydov   memcg, slab: fix ...
2505
  	struct kmem_cache *memcg_cachep;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2506
  	int kmemcg_id;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2507

f7ce3190c   Vladimir Davydov   slab: embed memcg...
2508
  	VM_BUG_ON(!is_root_cache(cachep));
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2509

9d100c5e4   Vladimir Davydov   memcg: don't chec...
2510
  	if (current->memcg_kmem_skip_account)
0e9d92f2d   Glauber Costa   memcg: skip memcg...
2511
  		return cachep;
8135be5a8   Vladimir Davydov   memcg: fix possib...
2512
  	memcg = get_mem_cgroup_from_mm(current->mm);
4db0c3c29   Jason Low   mm: remove rest o...
2513
  	kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2514
  	if (kmemcg_id < 0)
ca0dde971   Li Zefan   memcg: take refer...
2515
  		goto out;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2516

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
2517
  	memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2518
2519
  	if (likely(memcg_cachep))
  		return memcg_cachep;
ca0dde971   Li Zefan   memcg: take refer...
2520
2521
2522
2523
2524
2525
2526
2527
2528
  
  	/*
  	 * If we are in a safe context (can wait, and not in interrupt
  	 * context), we could be predictable and return right away.
  	 * This would guarantee that the allocation being performed
  	 * already belongs in the new cache.
  	 *
  	 * However, there are some clashes that can arise from locking.
  	 * For instance, because we acquire the slab_mutex while doing
776ed0f03   Vladimir Davydov   memcg: cleanup km...
2529
2530
2531
  	 * memcg_create_kmem_cache, this means no further allocation
  	 * could happen with the slab_mutex held. So it's better to
  	 * defer everything.
ca0dde971   Li Zefan   memcg: take refer...
2532
  	 */
d5b3cf713   Vladimir Davydov   memcg: zap memcg_...
2533
  	memcg_schedule_kmem_cache_create(memcg, cachep);
ca0dde971   Li Zefan   memcg: take refer...
2534
  out:
8135be5a8   Vladimir Davydov   memcg: fix possib...
2535
  	css_put(&memcg->css);
ca0dde971   Li Zefan   memcg: take refer...
2536
  	return cachep;
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2537
  }
d7f25f8a2   Glauber Costa   memcg: infrastruc...
2538

8135be5a8   Vladimir Davydov   memcg: fix possib...
2539
2540
2541
  void __memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
  	if (!is_root_cache(cachep))
f7ce3190c   Vladimir Davydov   slab: embed memcg...
2542
  		css_put(&cachep->memcg_params.memcg->css);
8135be5a8   Vladimir Davydov   memcg: fix possib...
2543
  }
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
  /*
   * We need to verify if the allocation against current->mm->owner's memcg is
   * possible for the given order. But the page is not allocated yet, so we'll
   * need a further commit step to do the final arrangements.
   *
   * It is possible for the task to switch cgroups in the meantime, so at
   * commit time, we can't rely on task conversion any longer.  We'll then use
   * the handle argument to return to the caller which cgroup we should commit
   * against. We could also return the memcg directly and avoid the pointer
   * passing, but a boolean return value gives better semantics considering
   * the compiled-out case as well.
   *
   * Returning true means the allocation is possible.
   */
  bool
  __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	*_memcg = NULL;
6d42c232b   Glauber Costa   memcg: also test ...
2565

df3819754   Johannes Weiner   memcg: get_mem_cg...
2566
  	memcg = get_mem_cgroup_from_mm(current->mm);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2567

cf2b8fbf1   Vladimir Davydov   memcg: zap memcg_...
2568
  	if (!memcg_kmem_is_active(memcg)) {
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2569
2570
2571
  		css_put(&memcg->css);
  		return true;
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2572
  	ret = memcg_charge_kmem(memcg, gfp, 1 << order);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2573
2574
  	if (!ret)
  		*_memcg = memcg;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2575
2576
2577
2578
2579
2580
2581
2582
  
  	css_put(&memcg->css);
  	return (ret == 0);
  }
  
  void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      int order)
  {
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2583
2584
2585
2586
  	VM_BUG_ON(mem_cgroup_is_root(memcg));
  
  	/* The page allocation failed. Revert */
  	if (!page) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2587
  		memcg_uncharge_kmem(memcg, 1 << order);
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2588
2589
  		return;
  	}
1306a85ae   Johannes Weiner   mm: embed the mem...
2590
  	page->mem_cgroup = memcg;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2591
2592
2593
2594
  }
  
  void __memcg_kmem_uncharge_pages(struct page *page, int order)
  {
1306a85ae   Johannes Weiner   mm: embed the mem...
2595
  	struct mem_cgroup *memcg = page->mem_cgroup;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2596

7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2597
2598
  	if (!memcg)
  		return;
309381fea   Sasha Levin   mm: dump page whe...
2599
  	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
298333157   Johannes Weiner   mm: memcontrol: r...
2600

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2601
  	memcg_uncharge_kmem(memcg, 1 << order);
1306a85ae   Johannes Weiner   mm: embed the mem...
2602
  	page->mem_cgroup = NULL;
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2603
  }
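  /*
   * Lifecycle of a kmem page charge, pieced together from the three helpers
   * above (the allocation call is a placeholder; callers normally go through
   * wrapper helpers that add the enablement checks):
   *
   *	if (!__memcg_kmem_newpage_charge(gfp, &memcg, order))
   *		return NULL;
   *	page = alloc_pages(gfp, order);
   *	if (memcg)
   *		__memcg_kmem_commit_charge(page, memcg, order);
   *	...
   *	__memcg_kmem_uncharge_pages(page, order);
   *
   * A false return from newpage_charge() means the allocation must fail;
   * commit_charge() reverts the charge itself when the page allocation
   * failed, so the caller does not have to; uncharge_pages() is called when
   * the page is freed.
   */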
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
  
  struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
  {
  	struct mem_cgroup *memcg = NULL;
  	struct kmem_cache *cachep;
  	struct page *page;
  
  	page = virt_to_head_page(ptr);
  	if (PageSlab(page)) {
  		cachep = page->slab_cache;
  		if (!is_root_cache(cachep))
f7ce3190c   Vladimir Davydov   slab: embed memcg...
2615
  			memcg = cachep->memcg_params.memcg;
60d3fd32a   Vladimir Davydov   list_lru: introdu...
2616
2617
2618
2619
2620
2621
  	} else
  		/* page allocated by alloc_kmem_pages */
  		memcg = page->mem_cgroup;
  
  	return memcg;
  }
7ae1e1d0f   Glauber Costa   memcg: kmem contr...
2622
  #endif /* CONFIG_MEMCG_KMEM */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2623
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2624
2625
  /*
   * Because tail pages are not marked as "used", mark them here. We're under
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2626
2627
2628
   * zone->lru_lock, 'splitting on pmd' and compound_lock.
   * charge/uncharge will never happen and move_account() is done under
   * compound_lock(), so we don't have to take care of races.
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2629
   */
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2630
  void mem_cgroup_split_huge_fixup(struct page *head)
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2631
  {
e94c8a9cb   KAMEZAWA Hiroyuki   memcg: make mem_c...
2632
  	int i;
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2633

3d37c4a91   KAMEZAWA Hiroyuki   memcg: bugfix che...
2634
2635
  	if (mem_cgroup_disabled())
  		return;
b070e65c0   David Rientjes   mm, memcg: add rs...
2636

298333157   Johannes Weiner   mm: memcontrol: r...
2637
  	for (i = 1; i < HPAGE_PMD_NR; i++)
1306a85ae   Johannes Weiner   mm: embed the mem...
2638
  		head[i].mem_cgroup = head->mem_cgroup;
b9982f8d2   Michal Hocko   mm: memcontrol: m...
2639

1306a85ae   Johannes Weiner   mm: embed the mem...
2640
  	__this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
b070e65c0   David Rientjes   mm, memcg: add rs...
2641
  		       HPAGE_PMD_NR);
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2642
  }
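  /*
   * The __this_cpu_sub() above only removes the huge page from the RSS_HUGE
   * counter of the head's memcg; the pages themselves remain accounted in
   * RSS, so splitting changes the huge/regular classification, not the total
   * charge.
   */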
12d271078   Hugh Dickins   memcg: fix split_...
2643
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
ca3e02141   KAMEZAWA Hiroyuki   memcg: fix USED b...
2644

c255a4580   Andrew Morton   memcg: rename con...
2645
  #ifdef CONFIG_MEMCG_SWAP
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2646
2647
  static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
  					 bool charge)
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2648
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2649
2650
  	int val = (charge) ? 1 : -1;
  	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2651
  }
024914477   Daisuke Nishimura   memcg: move charg...
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
  
  /**
   * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
   * @entry: swap entry to be moved
   * @from:  mem_cgroup which the entry is moved from
   * @to:  mem_cgroup which the entry is moved to
   *
   * It succeeds only when the swap_cgroup's record for this entry is the same
   * as the mem_cgroup's id of @from.
   *
   * Returns 0 on success, -EINVAL on failure.
   *
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2664
   * The caller must have charged to @to, IOW, called page_counter_charge() for
024914477   Daisuke Nishimura   memcg: move charg...
2665
2666
2667
   * both memory and memsw, and called css_get().
   */
  static int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2668
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2669
2670
  {
  	unsigned short old_id, new_id;
34c00c319   Li Zefan   memcg: convert to...
2671
2672
  	old_id = mem_cgroup_id(from);
  	new_id = mem_cgroup_id(to);
024914477   Daisuke Nishimura   memcg: move charg...
2673
2674
  
  	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
024914477   Daisuke Nishimura   memcg: move charg...
2675
  		mem_cgroup_swap_statistics(from, false);
483c30b51   Daisuke Nishimura   memcg: improve pe...
2676
  		mem_cgroup_swap_statistics(to, true);
024914477   Daisuke Nishimura   memcg: move charg...
2677
2678
2679
2680
2681
2682
  		return 0;
  	}
  	return -EINVAL;
  }
  #else
  static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
e91cbb425   Hugh Dickins   memcg swap: mem_c...
2683
  				struct mem_cgroup *from, struct mem_cgroup *to)
024914477   Daisuke Nishimura   memcg: move charg...
2684
2685
2686
  {
  	return -EINVAL;
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2687
  #endif
d13d14430   KAMEZAWA Hiroyuki   memcg: handle swa...
2688

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2689
  static DEFINE_MUTEX(memcg_limit_mutex);
f212ad7cf   Daisuke Nishimura   memcg: add memcg ...
2690

d38d2a758   KOSAKI Motohiro   mm: make mem_cgro...
2691
  static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2692
  				   unsigned long limit)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2693
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2694
2695
2696
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2697
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2698
  	int ret;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2699
2700
2701
2702
2703
2704
  
  	/*
  	 * To keep hierarchical reclaim simple, how long we should retry
  	 * depends on the caller. We set our retry count to be a function
  	 * of the number of children we should visit in this loop.
  	 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2705
2706
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2707

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2708
  	oldusage = page_counter_read(&memcg->memory);
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2709

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2710
  	do {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2711
2712
2713
2714
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2715
2716
2717
2718
  
  		mutex_lock(&memcg_limit_mutex);
  		if (limit > memcg->memsw.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2719
  			ret = -EINVAL;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2720
2721
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2722
2723
2724
2725
  		if (limit > memcg->memory.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memory, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2726
2727
2728
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2729
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2730
  		curusage = page_counter_read(&memcg->memory);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2731
  		/* Usage is reduced ? */
f894ffa86   Andrew Morton   memcg: trivial cl...
2732
  		if (curusage >= oldusage)
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2733
2734
2735
  			retry_count--;
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2736
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2737
2738
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
14797e236   KOSAKI Motohiro   memcg: add inacti...
2739

8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2740
2741
  	return ret;
  }
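  /*
   * Worked example for the retry budget above: retry_count is
   * MEM_CGROUP_RECLAIM_RETRIES multiplied by the number of children, so a
   * memcg with three children gets three times the base number of reclaim
   * rounds before the last page_counter_limit() error is returned.
   */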
338c84310   Li Zefan   memcg: remove som...
2742
  static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2743
  					 unsigned long limit)
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2744
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2745
2746
2747
  	unsigned long curusage;
  	unsigned long oldusage;
  	bool enlarge = false;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2748
  	int retry_count;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2749
  	int ret;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2750

81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2751
  	/* see mem_cgroup_resize_limit */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2752
2753
2754
2755
2756
2757
  	retry_count = MEM_CGROUP_RECLAIM_RETRIES *
  		      mem_cgroup_count_children(memcg);
  
  	oldusage = page_counter_read(&memcg->memsw);
  
  	do {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2758
2759
2760
2761
  		if (signal_pending(current)) {
  			ret = -EINTR;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2762
2763
2764
2765
  
  		mutex_lock(&memcg_limit_mutex);
  		if (limit < memcg->memory.limit) {
  			mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2766
  			ret = -EINVAL;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2767
2768
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2769
2770
2771
2772
  		if (limit > memcg->memsw.limit)
  			enlarge = true;
  		ret = page_counter_limit(&memcg->memsw, limit);
  		mutex_unlock(&memcg_limit_mutex);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2773
2774
2775
  
  		if (!ret)
  			break;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2776
  		try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2777
  		curusage = page_counter_read(&memcg->memsw);
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2778
  		/* Usage is reduced ? */
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
2779
  		if (curusage >= oldusage)
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2780
  			retry_count--;
81d39c20f   KAMEZAWA Hiroyuki   memcg: fix shrink...
2781
2782
  		else
  			oldusage = curusage;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2783
  	} while (retry_count);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
2784
2785
  	if (!ret && enlarge)
  		memcg_oom_recover(memcg);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2786

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
2787
2788
  	return ret;
  }
0608f43da   Andrew Morton   revert "memcg, vm...
2789
2790
2791
2792
2793
2794
2795
2796
2797
  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
  					    gfp_t gfp_mask,
  					    unsigned long *total_scanned)
  {
  	unsigned long nr_reclaimed = 0;
  	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
  	unsigned long reclaimed;
  	int loop = 0;
  	struct mem_cgroup_tree_per_zone *mctz;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2798
  	unsigned long excess;
0608f43da   Andrew Morton   revert "memcg, vm...
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
  	unsigned long nr_scanned;
  
  	if (order > 0)
  		return 0;
  
  	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
  	/*
  	 * This loop can run for a while, especially if mem_cgroups continuously
  	 * keep exceeding their soft limit and putting the system under
  	 * pressure.
  	 */
  	do {
  		if (next_mz)
  			mz = next_mz;
  		else
  			mz = mem_cgroup_largest_soft_limit_node(mctz);
  		if (!mz)
  			break;
  
  		nr_scanned = 0;
  		reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
  						    gfp_mask, &nr_scanned);
  		nr_reclaimed += reclaimed;
  		*total_scanned += nr_scanned;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2823
  		spin_lock_irq(&mctz->lock);
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2824
  		__mem_cgroup_remove_exceeded(mz, mctz);
0608f43da   Andrew Morton   revert "memcg, vm...
2825
2826
2827
2828
2829
2830
  
  		/*
  		 * If we failed to reclaim anything from this memory cgroup
  		 * it is time to move on to the next cgroup
  		 */
  		next_mz = NULL;
bc2f2e7ff   Vladimir Davydov   memcg: simplify u...
2831
2832
  		if (!reclaimed)
  			next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2833
  		excess = soft_limit_excess(mz->memcg);
0608f43da   Andrew Morton   revert "memcg, vm...
2834
2835
2836
2837
2838
2839
2840
2841
2842
  		/*
  		 * One school of thought says that we should not add
  		 * back the node to the tree if reclaim returns 0.
  		 * But our reclaim could return 0 simply because, due
  		 * to priority, we are exposing a smaller subset of
  		 * memory to reclaim from. Consider this as a longer
  		 * term TODO.
  		 */
  		/* If excess == 0, no tree ops */
cf2c81279   Johannes Weiner   mm: memcontrol: r...
2843
  		__mem_cgroup_insert_exceeded(mz, mctz, excess);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
2844
  		spin_unlock_irq(&mctz->lock);
0608f43da   Andrew Morton   revert "memcg, vm...
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
  		css_put(&mz->memcg->css);
  		loop++;
  		/*
  		 * Could not reclaim anything and there are no more
  		 * mem cgroups to try or we seem to be looping without
  		 * reclaiming anything.
  		 */
  		if (!nr_reclaimed &&
  			(next_mz == NULL ||
  			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
  			break;
  	} while (!nr_reclaimed);
  	if (next_mz)
  		css_put(&next_mz->memcg->css);
  	return nr_reclaimed;
  }
ea280e7b4   Tejun Heo   memcg: update mem...
2861
2862
2863
2864
2865
2866
  /*
   * Test whether @memcg has children, dead or alive.  Note that this
   * function doesn't care whether @memcg has use_hierarchy enabled and
   * returns %true if there are child csses according to the cgroup
   * hierarchy.  Testing use_hierarchy is the caller's responsibility.
   */
b5f99b537   Glauber Costa   memcg: fast hiera...
2867
2868
  static inline bool memcg_has_children(struct mem_cgroup *memcg)
  {
ea280e7b4   Tejun Heo   memcg: update mem...
2869
  	bool ret;
696ac172f   Johannes Weiner   mm: memcg: fix te...
2870
  	/*
ea280e7b4   Tejun Heo   memcg: update mem...
2871
2872
2873
2874
  	 * The lock does not prevent addition or deletion of children, but
  	 * it prevents a new child from being initialized based on this
  	 * parent in css_online(), so it's enough to decide whether
  	 * hierarchically inherited attributes can still be changed or not.
696ac172f   Johannes Weiner   mm: memcg: fix te...
2875
  	 */
ea280e7b4   Tejun Heo   memcg: update mem...
2876
2877
2878
2879
2880
2881
  	lockdep_assert_held(&memcg_create_mutex);
  
  	rcu_read_lock();
  	ret = css_next_child(NULL, &memcg->css);
  	rcu_read_unlock();
  	return ret;
b5f99b537   Glauber Costa   memcg: fast hiera...
2882
2883
2884
  }
  
  /*
c26251f9f   Michal Hocko   memcg: split mem_...
2885
2886
2887
2888
2889
2890
2891
2892
   * Reclaims as many pages from the given memcg as possible and moves
   * the rest to the parent.
   *
   * Caller is responsible for holding css reference for memcg.
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
  {
  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
c26251f9f   Michal Hocko   memcg: split mem_...
2893

c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2894
2895
  	/* we call try-to-free pages to make this cgroup empty */
  	lru_add_drain_all();
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2896
  	/* try to free all pages in this cgroup */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2897
  	while (nr_retries && page_counter_read(&memcg->memory)) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2898
  		int progress;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2899

c26251f9f   Michal Hocko   memcg: split mem_...
2900
2901
  		if (signal_pending(current))
  			return -EINTR;
b70a2a21d   Johannes Weiner   mm: memcontrol: f...
2902
2903
  		progress = try_to_free_mem_cgroup_pages(memcg, 1,
  							GFP_KERNEL, true);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2904
  		if (!progress) {
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2905
  			nr_retries--;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2906
  			/* maybe some writeback is necessary */
8aa7e847d   Jens Axboe   Fix congestion_wa...
2907
  			congestion_wait(BLK_RW_ASYNC, HZ/10);
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2908
  		}
f817ed485   KAMEZAWA Hiroyuki   memcg: move all a...
2909
2910
  
  	}
ab5196c20   Michal Hocko   memcg: make mem_c...
2911
2912
  
  	return 0;
cc8475822   KAMEZAWA Hiroyuki   memory cgroup enh...
2913
  }
6770c64e5   Tejun Heo   cgroup: replace c...
2914
2915
2916
  static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
  					    char *buf, size_t nbytes,
  					    loff_t off)
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2917
  {
6770c64e5   Tejun Heo   cgroup: replace c...
2918
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
c26251f9f   Michal Hocko   memcg: split mem_...
2919

d84230118   Michal Hocko   memcg: root_cgrou...
2920
2921
  	if (mem_cgroup_is_root(memcg))
  		return -EINVAL;
6770c64e5   Tejun Heo   cgroup: replace c...
2922
  	return mem_cgroup_force_empty(memcg) ?: nbytes;
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
2923
  }
182446d08   Tejun Heo   cgroup: pass arou...
2924
2925
  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
  				     struct cftype *cft)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2926
  {
182446d08   Tejun Heo   cgroup: pass arou...
2927
  	return mem_cgroup_from_css(css)->use_hierarchy;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2928
  }
182446d08   Tejun Heo   cgroup: pass arou...
2929
2930
  static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
  				      struct cftype *cft, u64 val)
18f59ea7d   Balbir Singh   memcg: memory cgr...
2931
2932
  {
  	int retval = 0;
182446d08   Tejun Heo   cgroup: pass arou...
2933
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
2934
  	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
18f59ea7d   Balbir Singh   memcg: memory cgr...
2935

0999821b1   Glauber Costa   memcg: replace cg...
2936
  	mutex_lock(&memcg_create_mutex);
567fb435b   Glauber Costa   memcg: fix bad be...
2937
2938
2939
  
  	if (memcg->use_hierarchy == val)
  		goto out;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2940
  	/*
af901ca18   André Goddard Rosa   tree-wide: fix as...
2941
  	 * If parent's use_hierarchy is set, we can't make any modifications
18f59ea7d   Balbir Singh   memcg: memory cgr...
2942
2943
2944
2945
2946
2947
  	 * in the child subtrees. If it is unset, then the change can
  	 * occur, provided the current cgroup has no children.
  	 *
	 * For the root cgroup, parent_memcg is NULL; we allow the value to
	 * be set if there are no children.
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
2948
  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
18f59ea7d   Balbir Singh   memcg: memory cgr...
2949
  				(val == 1 || val == 0)) {
ea280e7b4   Tejun Heo   memcg: update mem...
2950
  		if (!memcg_has_children(memcg))
c0ff4b854   Raghavendra K T   memcg: rename mem...
2951
  			memcg->use_hierarchy = val;
18f59ea7d   Balbir Singh   memcg: memory cgr...
2952
2953
2954
2955
  		else
  			retval = -EBUSY;
  	} else
  		retval = -EINVAL;
567fb435b   Glauber Costa   memcg: fix bad be...
2956
2957
  
  out:
0999821b1   Glauber Costa   memcg: replace cg...
2958
  	mutex_unlock(&memcg_create_mutex);
18f59ea7d   Balbir Singh   memcg: memory cgr...
2959
2960
2961
  
  	return retval;
  }
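
/*
 * Rough summary of the rules enforced above (the code is authoritative):
 * memory.use_hierarchy accepts only 0 or 1, and can only be changed while
 * the parent's use_hierarchy is unset (or the group is the root) and the
 * group has no children of its own, e.g.:
 *
 *	echo 1 > <group>/memory.use_hierarchy	# before creating sub-groups
 */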
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2962
2963
  static unsigned long tree_stat(struct mem_cgroup *memcg,
  			       enum mem_cgroup_stat_index idx)
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
  {
  	struct mem_cgroup *iter;
  	long val = 0;
  
  	/* Per-cpu values can be negative, use a signed accumulator */
  	for_each_mem_cgroup_tree(iter, memcg)
  		val += mem_cgroup_read_stat(iter, idx);
  
  	if (val < 0) /* race ? */
  		val = 0;
  	return val;
  }
  
  static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
  {
  	u64 val;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2980
2981
2982
2983
2984
2985
  	if (mem_cgroup_is_root(memcg)) {
  		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
  		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
  		if (swap)
  			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
  	} else {
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2986
  		if (!swap)
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2987
  			val = page_counter_read(&memcg->memory);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2988
  		else
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2989
  			val = page_counter_read(&memcg->memsw);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2990
  	}
ce00a9673   Johannes Weiner   mm: memcontrol: r...
2991
2992
  	return val << PAGE_SHIFT;
  }
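
/*
 * mem_cgroup_usage() returns bytes, not pages: the page counters are kept
 * in pages and shifted by PAGE_SHIFT on the way out.  For example, with
 * 4KiB pages a counter reading of 1024 pages is reported as 4194304 bytes.
 */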
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
2993
2994
2995
2996
2997
2998
2999
  enum {
  	RES_USAGE,
  	RES_LIMIT,
  	RES_MAX_USAGE,
  	RES_FAILCNT,
  	RES_SOFT_LIMIT,
  };
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3000

791badbdb   Tejun Heo   memcg: convert aw...
3001
  static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
05b843012   Johannes Weiner   mm: memcontrol: u...
3002
  			       struct cftype *cft)
8cdea7c05   Balbir Singh   Memory controller...
3003
  {
182446d08   Tejun Heo   cgroup: pass arou...
3004
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3005
  	struct page_counter *counter;
af36f906c   Tejun Heo   memcg: always cre...
3006

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3007
  	switch (MEMFILE_TYPE(cft->private)) {
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3008
  	case _MEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3009
3010
  		counter = &memcg->memory;
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3011
  	case _MEMSWAP:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3012
3013
  		counter = &memcg->memsw;
  		break;
510fc4e11   Glauber Costa   memcg: kmem accou...
3014
  	case _KMEM:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3015
  		counter = &memcg->kmem;
510fc4e11   Glauber Costa   memcg: kmem accou...
3016
  		break;
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3017
3018
  	default:
  		BUG();
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3019
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
  
  	switch (MEMFILE_ATTR(cft->private)) {
  	case RES_USAGE:
  		if (counter == &memcg->memory)
  			return mem_cgroup_usage(memcg, false);
  		if (counter == &memcg->memsw)
  			return mem_cgroup_usage(memcg, true);
  		return (u64)page_counter_read(counter) * PAGE_SIZE;
  	case RES_LIMIT:
  		return (u64)counter->limit * PAGE_SIZE;
  	case RES_MAX_USAGE:
  		return (u64)counter->watermark * PAGE_SIZE;
  	case RES_FAILCNT:
  		return counter->failcnt;
  	case RES_SOFT_LIMIT:
  		return (u64)memcg->soft_limit * PAGE_SIZE;
  	default:
  		BUG();
  	}
8cdea7c05   Balbir Singh   Memory controller...
3039
  }
510fc4e11   Glauber Costa   memcg: kmem accou...
3040

510fc4e11   Glauber Costa   memcg: kmem accou...
3041
  #ifdef CONFIG_MEMCG_KMEM
8c0145b62   Vladimir Davydov   memcg: remove act...
3042
3043
  static int memcg_activate_kmem(struct mem_cgroup *memcg,
  			       unsigned long nr_pages)
d64416377   Vladimir Davydov   memcg: rework mem...
3044
3045
3046
  {
  	int err = 0;
  	int memcg_id;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3047
  	BUG_ON(memcg->kmemcg_id >= 0);
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3048
  	BUG_ON(memcg->kmem_acct_activated);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3049
  	BUG_ON(memcg->kmem_acct_active);
d64416377   Vladimir Davydov   memcg: rework mem...
3050
3051
  
  	/*
510fc4e11   Glauber Costa   memcg: kmem accou...
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
  	 * For simplicity, we won't allow this to be disabled.  It also can't
  	 * be changed if the cgroup has children already, or if tasks had
  	 * already joined.
  	 *
  	 * If tasks join before we set the limit, a person looking at
  	 * kmem.usage_in_bytes will have no way to determine when it took
  	 * place, which makes the value quite meaningless.
  	 *
  	 * After it first became limited, changes in the value of the limit are
  	 * of course permitted.
510fc4e11   Glauber Costa   memcg: kmem accou...
3062
  	 */
0999821b1   Glauber Costa   memcg: replace cg...
3063
  	mutex_lock(&memcg_create_mutex);
ea280e7b4   Tejun Heo   memcg: update mem...
3064
3065
  	if (cgroup_has_tasks(memcg->css.cgroup) ||
  	    (memcg->use_hierarchy && memcg_has_children(memcg)))
d64416377   Vladimir Davydov   memcg: rework mem...
3066
3067
3068
3069
  		err = -EBUSY;
  	mutex_unlock(&memcg_create_mutex);
  	if (err)
  		goto out;
510fc4e11   Glauber Costa   memcg: kmem accou...
3070

f3bb3043a   Vladimir Davydov   memcg: don't call...
3071
  	memcg_id = memcg_alloc_cache_id();
d64416377   Vladimir Davydov   memcg: rework mem...
3072
3073
3074
3075
  	if (memcg_id < 0) {
  		err = memcg_id;
  		goto out;
  	}
d64416377   Vladimir Davydov   memcg: rework mem...
3076
  	/*
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3077
3078
	 * We couldn't have accounted to this cgroup, because it hasn't been
	 * activated yet, so this should succeed.
d64416377   Vladimir Davydov   memcg: rework mem...
3079
  	 */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3080
  	err = page_counter_limit(&memcg->kmem, nr_pages);
d64416377   Vladimir Davydov   memcg: rework mem...
3081
3082
3083
3084
  	VM_BUG_ON(err);
  
  	static_key_slow_inc(&memcg_kmem_enabled_key);
  	/*
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3085
3086
  	 * A memory cgroup is considered kmem-active as soon as it gets
  	 * kmemcg_id. Setting the id after enabling static branching will
d64416377   Vladimir Davydov   memcg: rework mem...
3087
3088
3089
  	 * guarantee no one starts accounting before all call sites are
  	 * patched.
  	 */
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
3090
  	memcg->kmemcg_id = memcg_id;
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3091
  	memcg->kmem_acct_activated = true;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3092
  	memcg->kmem_acct_active = true;
510fc4e11   Glauber Costa   memcg: kmem accou...
3093
  out:
d64416377   Vladimir Davydov   memcg: rework mem...
3094
  	return err;
d64416377   Vladimir Davydov   memcg: rework mem...
3095
  }
d64416377   Vladimir Davydov   memcg: rework mem...
3096
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3097
  				   unsigned long limit)
d64416377   Vladimir Davydov   memcg: rework mem...
3098
3099
  {
  	int ret;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3100
  	mutex_lock(&memcg_limit_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
3101
  	if (!memcg_kmem_is_active(memcg))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3102
  		ret = memcg_activate_kmem(memcg, limit);
d64416377   Vladimir Davydov   memcg: rework mem...
3103
  	else
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3104
3105
  		ret = page_counter_limit(&memcg->kmem, limit);
  	mutex_unlock(&memcg_limit_mutex);
510fc4e11   Glauber Costa   memcg: kmem accou...
3106
3107
  	return ret;
  }
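
/*
 * A sketch of how this is reached from userspace (file name per the usual
 * v1 memory controller interface):
 *
 *	echo 64M > <group>/memory.kmem.limit_in_bytes
 *
 * The first successful write activates kernel memory accounting for the
 * group via memcg_activate_kmem(); later writes only adjust the limit.
 */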
55007d849   Glauber Costa   memcg: allocate m...
3108
  static int memcg_propagate_kmem(struct mem_cgroup *memcg)
510fc4e11   Glauber Costa   memcg: kmem accou...
3109
  {
55007d849   Glauber Costa   memcg: allocate m...
3110
  	int ret = 0;
510fc4e11   Glauber Costa   memcg: kmem accou...
3111
  	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
55007d849   Glauber Costa   memcg: allocate m...
3112

d64416377   Vladimir Davydov   memcg: rework mem...
3113
3114
  	if (!parent)
  		return 0;
55007d849   Glauber Costa   memcg: allocate m...
3115

8c0145b62   Vladimir Davydov   memcg: remove act...
3116
  	mutex_lock(&memcg_limit_mutex);
55007d849   Glauber Costa   memcg: allocate m...
3117
  	/*
d64416377   Vladimir Davydov   memcg: rework mem...
3118
3119
  	 * If the parent cgroup is not kmem-active now, it cannot be activated
  	 * after this point, because it has at least one child already.
55007d849   Glauber Costa   memcg: allocate m...
3120
  	 */
d64416377   Vladimir Davydov   memcg: rework mem...
3121
  	if (memcg_kmem_is_active(parent))
8c0145b62   Vladimir Davydov   memcg: remove act...
3122
3123
  		ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
  	mutex_unlock(&memcg_limit_mutex);
55007d849   Glauber Costa   memcg: allocate m...
3124
  	return ret;
510fc4e11   Glauber Costa   memcg: kmem accou...
3125
  }
d64416377   Vladimir Davydov   memcg: rework mem...
3126
3127
  #else
  static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3128
  				   unsigned long limit)
d64416377   Vladimir Davydov   memcg: rework mem...
3129
3130
3131
  {
  	return -EINVAL;
  }
6d0439904   Hugh Dickins   memcg: stop warni...
3132
  #endif /* CONFIG_MEMCG_KMEM */
510fc4e11   Glauber Costa   memcg: kmem accou...
3133

628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3134
3135
3136
3137
  /*
   * The user of this function is...
   * RES_LIMIT.
   */
451af504d   Tejun Heo   cgroup: replace c...
3138
3139
  static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
8cdea7c05   Balbir Singh   Memory controller...
3140
  {
451af504d   Tejun Heo   cgroup: replace c...
3141
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3142
  	unsigned long nr_pages;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3143
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
3144
  	buf = strstrip(buf);
650c5e565   Johannes Weiner   mm: page_counter:...
3145
  	ret = page_counter_memparse(buf, "-1", &nr_pages);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3146
3147
  	if (ret)
  		return ret;
af36f906c   Tejun Heo   memcg: always cre...
3148

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3149
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3150
  	case RES_LIMIT:
4b3bde4c9   Balbir Singh   memcg: remove the...
3151
3152
3153
3154
  		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
  			ret = -EINVAL;
  			break;
  		}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3155
3156
3157
  		switch (MEMFILE_TYPE(of_cft(of)->private)) {
  		case _MEM:
  			ret = mem_cgroup_resize_limit(memcg, nr_pages);
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
3158
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3159
3160
  		case _MEMSWAP:
  			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
296c81d89   Balbir Singh   memory controller...
3161
  			break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3162
3163
3164
3165
  		case _KMEM:
  			ret = memcg_update_kmem_limit(memcg, nr_pages);
  			break;
  		}
296c81d89   Balbir Singh   memory controller...
3166
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3167
3168
3169
  	case RES_SOFT_LIMIT:
  		memcg->soft_limit = nr_pages;
  		ret = 0;
628f42355   KAMEZAWA Hiroyuki   memcg: limit chan...
3170
3171
  		break;
  	}
451af504d   Tejun Heo   cgroup: replace c...
3172
  	return ret ?: nbytes;
8cdea7c05   Balbir Singh   Memory controller...
3173
  }
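
/*
 * Typical writers of the files backed by mem_cgroup_write(), assuming the
 * usual v1 file names:
 *
 *	echo 512M > <group>/memory.limit_in_bytes
 *	echo 1G   > <group>/memory.memsw.limit_in_bytes
 *	echo 256M > <group>/memory.soft_limit_in_bytes
 *
 * page_counter_memparse() accepts the usual K/M/G suffixes; writing "-1"
 * maps to the "no limit" value (PAGE_COUNTER_MAX).
 */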
6770c64e5   Tejun Heo   cgroup: replace c...
3174
3175
  static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
  				size_t nbytes, loff_t off)
c84872e16   Pavel Emelyanov   memcgroup: add th...
3176
  {
6770c64e5   Tejun Heo   cgroup: replace c...
3177
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3178
  	struct page_counter *counter;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3179

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
  	switch (MEMFILE_TYPE(of_cft(of)->private)) {
  	case _MEM:
  		counter = &memcg->memory;
  		break;
  	case _MEMSWAP:
  		counter = &memcg->memsw;
  		break;
  	case _KMEM:
  		counter = &memcg->kmem;
  		break;
  	default:
  		BUG();
  	}
af36f906c   Tejun Heo   memcg: always cre...
3193

3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3194
  	switch (MEMFILE_ATTR(of_cft(of)->private)) {
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3195
  	case RES_MAX_USAGE:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3196
  		page_counter_reset_watermark(counter);
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3197
3198
  		break;
  	case RES_FAILCNT:
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3199
  		counter->failcnt = 0;
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3200
  		break;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3201
3202
  	default:
  		BUG();
29f2a4dac   Pavel Emelyanov   memcgroup: implem...
3203
  	}
f64c3f549   Balbir Singh   memory controller...
3204

6770c64e5   Tejun Heo   cgroup: replace c...
3205
  	return nbytes;
c84872e16   Pavel Emelyanov   memcgroup: add th...
3206
  }
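
/*
 * The value written to the reset files is ignored; any write does, e.g.:
 *
 *	echo 0 > <group>/memory.max_usage_in_bytes	# reset the watermark
 *	echo 0 > <group>/memory.failcnt			# clear the fail counter
 *
 * (file names assume the usual v1 memory controller layout)
 */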
182446d08   Tejun Heo   cgroup: pass arou...
3207
  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
3208
3209
  					struct cftype *cft)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3210
  	return mem_cgroup_from_css(css)->move_charge_at_immigrate;
7dc74be03   Daisuke Nishimura   memcg: add interf...
3211
  }
024914477   Daisuke Nishimura   memcg: move charg...
3212
  #ifdef CONFIG_MMU
182446d08   Tejun Heo   cgroup: pass arou...
3213
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
7dc74be03   Daisuke Nishimura   memcg: add interf...
3214
3215
  					struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3216
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7dc74be03   Daisuke Nishimura   memcg: add interf...
3217

1dfab5abc   Johannes Weiner   mm: memcontrol: f...
3218
  	if (val & ~MOVE_MASK)
7dc74be03   Daisuke Nishimura   memcg: add interf...
3219
  		return -EINVAL;
ee5e8472b   Glauber Costa   memcg: prevent ch...
3220

7dc74be03   Daisuke Nishimura   memcg: add interf...
3221
  	/*
ee5e8472b   Glauber Costa   memcg: prevent ch...
3222
3223
3224
3225
  	 * No kind of locking is needed in here, because ->can_attach() will
  	 * check this value once in the beginning of the process, and then carry
  	 * on with stale data. This means that changes to this value will only
  	 * affect task migrations starting after the change.
7dc74be03   Daisuke Nishimura   memcg: add interf...
3226
  	 */
c0ff4b854   Raghavendra K T   memcg: rename mem...
3227
  	memcg->move_charge_at_immigrate = val;
7dc74be03   Daisuke Nishimura   memcg: add interf...
3228
3229
  	return 0;
  }
024914477   Daisuke Nishimura   memcg: move charg...
3230
  #else
182446d08   Tejun Heo   cgroup: pass arou...
3231
  static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
024914477   Daisuke Nishimura   memcg: move charg...
3232
3233
3234
3235
3236
  					struct cftype *cft, u64 val)
  {
  	return -ENOSYS;
  }
  #endif
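
/*
 * memory.move_charge_at_immigrate is a bitmask validated against MOVE_MASK
 * above.  Historically bit 0 selects anonymous pages and bit 1 file pages,
 * so e.g.:
 *
 *	echo 3 > <group>/memory.move_charge_at_immigrate
 *
 * asks for both kinds to be moved when a task migrates into the group.
 */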
7dc74be03   Daisuke Nishimura   memcg: add interf...
3237

406eb0c9b   Ying Han   memcg: add memory...
3238
  #ifdef CONFIG_NUMA
2da8ca822   Tejun Heo   cgroup: replace c...
3239
  static int memcg_numa_stat_show(struct seq_file *m, void *v)
406eb0c9b   Ying Han   memcg: add memory...
3240
  {
25485de6e   Greg Thelen   memcg: refactor m...
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
  	struct numa_stat {
  		const char *name;
  		unsigned int lru_mask;
  	};
  
  	static const struct numa_stat stats[] = {
  		{ "total", LRU_ALL },
  		{ "file", LRU_ALL_FILE },
  		{ "anon", LRU_ALL_ANON },
  		{ "unevictable", BIT(LRU_UNEVICTABLE) },
  	};
  	const struct numa_stat *stat;
406eb0c9b   Ying Han   memcg: add memory...
3253
  	int nid;
25485de6e   Greg Thelen   memcg: refactor m...
3254
  	unsigned long nr;
2da8ca822   Tejun Heo   cgroup: replace c...
3255
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
406eb0c9b   Ying Han   memcg: add memory...
3256

25485de6e   Greg Thelen   memcg: refactor m...
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
  		seq_printf(m, "%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
  							  stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
3267
  	}
406eb0c9b   Ying Han   memcg: add memory...
3268

071aee138   Ying Han   memcg: support hi...
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
  	for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
  		struct mem_cgroup *iter;
  
  		nr = 0;
  		for_each_mem_cgroup_tree(iter, memcg)
  			nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
  		seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
  		for_each_node_state(nid, N_MEMORY) {
  			nr = 0;
  			for_each_mem_cgroup_tree(iter, memcg)
  				nr += mem_cgroup_node_nr_lru_pages(
  					iter, nid, stat->lru_mask);
  			seq_printf(m, " N%d=%lu", nid, nr);
  		}
		seq_putc(m, '\n');
406eb0c9b   Ying Han   memcg: add memory...
3285
  	}
406eb0c9b   Ying Han   memcg: add memory...
3286

406eb0c9b   Ying Han   memcg: add memory...
3287
3288
3289
  	return 0;
  }
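
/*
 * memcg_numa_stat_show() emits one line per LRU mask, roughly:
 *
 *	total=<pages> N0=<pages> N1=<pages> ...
 *	file=... anon=... unevictable=...
 *
 * followed by the same lines prefixed with "hierarchical_", which sum the
 * whole subtree instead of the single group.
 */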
  #endif /* CONFIG_NUMA */
2da8ca822   Tejun Heo   cgroup: replace c...
3290
  static int memcg_stat_show(struct seq_file *m, void *v)
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3291
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3292
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3293
  	unsigned long memory, memsw;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3294
3295
  	struct mem_cgroup *mi;
  	unsigned int i;
406eb0c9b   Ying Han   memcg: add memory...
3296

0ca44b148   Greg Thelen   memcg: add BUILD_...
3297
3298
3299
3300
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
  		     MEM_CGROUP_STAT_NSTATS);
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
  		     MEM_CGROUP_EVENTS_NSTATS);
70bc068c4   Rickard Strandqvist   mm/memcontrol.c: ...
3301
  	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3302
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
3303
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3304
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3305
3306
3307
  		seq_printf(m, "%s %ld
  ", mem_cgroup_stat_names[i],
  			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3308
  	}
7b854121e   Lee Schermerhorn   Unevictable LRU P...
3309

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3310
3311
3312
3313
3314
3315
3316
3317
3318
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_events_names[i],
  			   mem_cgroup_read_events(memcg, i));
  
  	for (i = 0; i < NR_LRU_LISTS; i++)
  		seq_printf(m, "%s %lu
  ", mem_cgroup_lru_names[i],
  			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3319
  	/* Hierarchical information */
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3320
3321
3322
3323
  	memory = memsw = PAGE_COUNTER_MAX;
  	for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
  		memory = min(memory, mi->memory.limit);
  		memsw = min(memsw, mi->memsw.limit);
fee7b548e   KAMEZAWA Hiroyuki   memcg: show real ...
3324
  	}
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3325
3326
3327
3328
3329
3330
3331
  	seq_printf(m, "hierarchical_memory_limit %llu
  ",
  		   (u64)memory * PAGE_SIZE);
  	if (do_swap_account)
  		seq_printf(m, "hierarchical_memsw_limit %llu
  ",
  			   (u64)memsw * PAGE_SIZE);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3332

af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3333
3334
  	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
  		long long val = 0;
bff6bb83f   Kamezawa Hiroyuki   memcg: rename MEM...
3335
  		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3336
  			continue;
af7c4b0ec   Johannes Weiner   mm: memcg: print ...
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
  		seq_printf(m, "total_%s %lld
  ", mem_cgroup_stat_names[i], val);
  	}
  
  	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_read_events(mi, i);
  		seq_printf(m, "total_%s %llu
  ",
  			   mem_cgroup_events_names[i], val);
  	}
  
  	for (i = 0; i < NR_LRU_LISTS; i++) {
  		unsigned long long val = 0;
  
  		for_each_mem_cgroup_tree(mi, memcg)
  			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
  		seq_printf(m, "total_%s %llu
  ", mem_cgroup_lru_names[i], val);
1dd3a2732   Daisuke Nishimura   memcg: show swap ...
3360
  	}
14067bb3e   KAMEZAWA Hiroyuki   memcg: hierarchic...
3361

7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3362
  #ifdef CONFIG_DEBUG_VM
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3363
3364
3365
  	{
  		int nid, zid;
  		struct mem_cgroup_per_zone *mz;
89abfab13   Hugh Dickins   mm/memcg: move re...
3366
  		struct zone_reclaim_stat *rstat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3367
3368
3369
3370
3371
  		unsigned long recent_rotated[2] = {0, 0};
  		unsigned long recent_scanned[2] = {0, 0};
  
  		for_each_online_node(nid)
  			for (zid = 0; zid < MAX_NR_ZONES; zid++) {
e231875ba   Jianyu Zhan   mm: memcontrol: c...
3372
  				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
89abfab13   Hugh Dickins   mm/memcg: move re...
3373
  				rstat = &mz->lruvec.reclaim_stat;
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3374

89abfab13   Hugh Dickins   mm/memcg: move re...
3375
3376
3377
3378
  				recent_rotated[0] += rstat->recent_rotated[0];
  				recent_rotated[1] += rstat->recent_rotated[1];
  				recent_scanned[0] += rstat->recent_scanned[0];
  				recent_scanned[1] += rstat->recent_scanned[1];
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3379
  			}
78ccf5b5a   Johannes Weiner   mm: memcg: print ...
3380
3381
3382
3383
3384
3385
3386
3387
  		seq_printf(m, "recent_rotated_anon %lu
  ", recent_rotated[0]);
  		seq_printf(m, "recent_rotated_file %lu
  ", recent_rotated[1]);
  		seq_printf(m, "recent_scanned_anon %lu
  ", recent_scanned[0]);
  		seq_printf(m, "recent_scanned_file %lu
  ", recent_scanned[1]);
7f016ee8b   KOSAKI Motohiro   memcg: show recla...
3388
3389
  	}
  #endif
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
3390
3391
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
3392
3393
  static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
  				      struct cftype *cft)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3394
  {
182446d08   Tejun Heo   cgroup: pass arou...
3395
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3396

1f4c025b5   KAMEZAWA Hiroyuki   memcg: export mem...
3397
  	return mem_cgroup_swappiness(memcg);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3398
  }
182446d08   Tejun Heo   cgroup: pass arou...
3399
3400
  static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
  				       struct cftype *cft, u64 val)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3401
  {
182446d08   Tejun Heo   cgroup: pass arou...
3402
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3403

3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3404
  	if (val > 100)
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3405
  		return -EINVAL;
14208b0ec   Linus Torvalds   Merge branch 'for...
3406
  	if (css->parent)
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3407
3408
3409
  		memcg->swappiness = val;
  	else
  		vm_swappiness = val;
068b38c1f   Li Zefan   memcg: fix a race...
3410

a7885eb8a   KOSAKI Motohiro   memcg: swappiness
3411
3412
  	return 0;
  }
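
/*
 * memory.swappiness takes 0..100.  Writing to a non-root group sets that
 * group's swappiness; writing to the root group falls through to the global
 * vm_swappiness, e.g.:
 *
 *	echo 10 > <group>/memory.swappiness
 */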
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3413
3414
3415
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
  	struct mem_cgroup_threshold_ary *t;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3416
  	unsigned long usage;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3417
3418
3419
3420
  	int i;
  
  	rcu_read_lock();
  	if (!swap)
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3421
  		t = rcu_dereference(memcg->thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3422
  	else
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3423
  		t = rcu_dereference(memcg->memsw_thresholds.primary);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3424
3425
3426
  
  	if (!t)
  		goto unlock;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3427
  	usage = mem_cgroup_usage(memcg, swap);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3428
3429
  
  	/*
748dad36d   Sha Zhengju   memcg: make thres...
3430
  	 * current_threshold points to threshold just below or equal to usage.
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3431
3432
3433
  	 * If it's not true, a threshold was crossed after last
  	 * call of __mem_cgroup_threshold().
  	 */
5407a5625   Phil Carmody   mm: remove unnece...
3434
  	i = t->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
  
  	/*
  	 * Iterate backward over array of thresholds starting from
  	 * current_threshold and check if a threshold is crossed.
  	 * If none of thresholds below usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* i = current_threshold + 1 */
  	i++;
  
  	/*
  	 * Iterate forward over array of thresholds starting from
  	 * current_threshold+1 and check if a threshold is crossed.
  	 * If none of thresholds above usage is crossed, we read
  	 * only one element of the array here.
  	 */
  	for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
  		eventfd_signal(t->entries[i].eventfd, 1);
  
  	/* Update current_threshold */
5407a5625   Phil Carmody   mm: remove unnece...
3458
  	t->current_threshold = i - 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3459
3460
3461
3462
3463
3464
  unlock:
  	rcu_read_unlock();
  }
  
  static void mem_cgroup_threshold(struct mem_cgroup *memcg)
  {
ad4ca5f4b   Kirill A. Shutemov   memcg: fix thresh...
3465
3466
3467
3468
3469
3470
3471
  	while (memcg) {
  		__mem_cgroup_threshold(memcg, false);
  		if (do_swap_account)
  			__mem_cgroup_threshold(memcg, true);
  
  		memcg = parent_mem_cgroup(memcg);
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3472
3473
3474
3475
3476
3477
  }
  
  static int compare_thresholds(const void *a, const void *b)
  {
  	const struct mem_cgroup_threshold *_a = a;
  	const struct mem_cgroup_threshold *_b = b;
2bff24a37   Greg Thelen   memcg: fix multip...
3478
3479
3480
3481
3482
3483
3484
  	if (_a->threshold > _b->threshold)
  		return 1;
  
  	if (_a->threshold < _b->threshold)
  		return -1;
  
  	return 0;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3485
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3486
  static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3487
3488
  {
  	struct mem_cgroup_eventfd_list *ev;
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3489
  	spin_lock(&memcg_oom_lock);
c0ff4b854   Raghavendra K T   memcg: rename mem...
3490
  	list_for_each_entry(ev, &memcg->oom_notify, list)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3491
  		eventfd_signal(ev->eventfd, 1);
2bcf2e92c   Michal Hocko   memcg: oom_notify...
3492
3493
  
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3494
3495
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
3496
  static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3497
  {
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3498
  	struct mem_cgroup *iter;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3499
  	for_each_mem_cgroup_tree(iter, memcg)
7d74b06f2   KAMEZAWA Hiroyuki   memcg: use for_ea...
3500
  		mem_cgroup_oom_notify_cb(iter);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3501
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3502
  static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3503
  	struct eventfd_ctx *eventfd, const char *args, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3504
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3505
3506
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3507
3508
  	unsigned long threshold;
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3509
  	int i, size, ret;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3510

650c5e565   Johannes Weiner   mm: page_counter:...
3511
  	ret = page_counter_memparse(args, "-1", &threshold);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3512
3513
3514
3515
  	if (ret)
  		return ret;
  
  	mutex_lock(&memcg->thresholds_lock);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3516

05b843012   Johannes Weiner   mm: memcontrol: u...
3517
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3518
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3519
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3520
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3521
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3522
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3523
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3524
  		BUG();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3525
  	/* Check if a threshold crossed before adding a new one */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3526
  	if (thresholds->primary)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3527
  		__mem_cgroup_threshold(memcg, type == _MEMSWAP);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3528
  	size = thresholds->primary ? thresholds->primary->size + 1 : 1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3529
3530
  
  	/* Allocate memory for new array of thresholds */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3531
  	new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3532
  			GFP_KERNEL);
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3533
  	if (!new) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3534
3535
3536
  		ret = -ENOMEM;
  		goto unlock;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3537
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3538
3539
  
  	/* Copy thresholds (if any) to new array */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3540
3541
  	if (thresholds->primary) {
  		memcpy(new->entries, thresholds->primary->entries, (size - 1) *
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3542
  				sizeof(struct mem_cgroup_threshold));
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3543
  	}
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3544
  	/* Add new threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3545
3546
  	new->entries[size - 1].eventfd = eventfd;
  	new->entries[size - 1].threshold = threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3547
3548
  
  	/* Sort thresholds. Registering of new threshold isn't time-critical */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3549
  	sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3550
3551
3552
  			compare_thresholds, NULL);
  
  	/* Find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3553
  	new->current_threshold = -1;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3554
  	for (i = 0; i < size; i++) {
748dad36d   Sha Zhengju   memcg: make thres...
3555
  		if (new->entries[i].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3556
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3557
3558
  			 * new->current_threshold will not be used until
  			 * rcu_assign_pointer(), so it's safe to increment
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3559
3560
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3561
  			++new->current_threshold;
748dad36d   Sha Zhengju   memcg: make thres...
3562
3563
  		} else
  			break;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3564
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3565
3566
3567
3568
3569
  	/* Free old spare buffer and save old primary buffer as spare */
  	kfree(thresholds->spare);
  	thresholds->spare = thresholds->primary;
  
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3570

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3571
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3572
  	synchronize_rcu();
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3573
3574
3575
3576
3577
  unlock:
  	mutex_unlock(&memcg->thresholds_lock);
  
  	return ret;
  }
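
/*
 * A minimal sketch of how a usage threshold reaches the function above via
 * the legacy cgroup.event_control interface (fd numbers are illustrative):
 *
 *	efd = eventfd(0, 0);
 *	cfd = open("<group>/memory.usage_in_bytes", O_RDONLY);
 *	write "<efd> <cfd> 50M" to <group>/cgroup.event_control
 *
 * The "50M" argument is parsed by page_counter_memparse(), and the eventfd
 * is signalled each time usage crosses the threshold.
 */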
59b6f8734   Tejun Heo   memcg: make cgrou...
3578
  static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3579
3580
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3581
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3582
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3583
  static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3584
3585
  	struct eventfd_ctx *eventfd, const char *args)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3586
  	return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3587
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3588
  static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3589
  	struct eventfd_ctx *eventfd, enum res_type type)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3590
  {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3591
3592
  	struct mem_cgroup_thresholds *thresholds;
  	struct mem_cgroup_threshold_ary *new;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
3593
  	unsigned long usage;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3594
  	int i, j, size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3595
3596
  
  	mutex_lock(&memcg->thresholds_lock);
05b843012   Johannes Weiner   mm: memcontrol: u...
3597
3598
  
  	if (type == _MEM) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3599
  		thresholds = &memcg->thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3600
  		usage = mem_cgroup_usage(memcg, false);
05b843012   Johannes Weiner   mm: memcontrol: u...
3601
  	} else if (type == _MEMSWAP) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3602
  		thresholds = &memcg->memsw_thresholds;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
3603
  		usage = mem_cgroup_usage(memcg, true);
05b843012   Johannes Weiner   mm: memcontrol: u...
3604
  	} else
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3605
  		BUG();
371528cae   Anton Vorontsov   mm: memcg: Correc...
3606
3607
  	if (!thresholds->primary)
  		goto unlock;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3608
3609
3610
3611
  	/* Check if a threshold crossed before removing */
  	__mem_cgroup_threshold(memcg, type == _MEMSWAP);
  
  	/* Calculate new number of threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3612
3613
3614
  	size = 0;
  	for (i = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd != eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3615
3616
  			size++;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3617
  	new = thresholds->spare;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3618

2e72b6347   Kirill A. Shutemov   memcg: implement ...
3619
3620
  	/* Set thresholds array to NULL if we don't have thresholds */
  	if (!size) {
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3621
3622
  		kfree(new);
  		new = NULL;
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3623
  		goto swap_buffers;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3624
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3625
  	new->size = size;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3626
3627
  
  	/* Copy thresholds and find current threshold */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3628
3629
3630
  	new->current_threshold = -1;
  	for (i = 0, j = 0; i < thresholds->primary->size; i++) {
  		if (thresholds->primary->entries[i].eventfd == eventfd)
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3631
  			continue;
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3632
  		new->entries[j] = thresholds->primary->entries[i];
748dad36d   Sha Zhengju   memcg: make thres...
3633
  		if (new->entries[j].threshold <= usage) {
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3634
  			/*
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3635
  			 * new->current_threshold will not be used
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3636
3637
3638
  			 * until rcu_assign_pointer(), so it's safe to increment
  			 * it here.
  			 */
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3639
  			++new->current_threshold;
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3640
3641
3642
  		}
  		j++;
  	}
907860ed3   Kirill A. Shutemov   cgroups: make cft...
3643
  swap_buffers:
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3644
3645
  	/* Swap primary and spare array */
  	thresholds->spare = thresholds->primary;
8c7577637   Sha Zhengju   memcg: free spare...
3646
3647
3648
3649
3650
  	/* If all events are unregistered, free the spare array */
  	if (!new) {
  		kfree(thresholds->spare);
  		thresholds->spare = NULL;
  	}
2c488db27   Kirill A. Shutemov   memcg: clean up m...
3651
  	rcu_assign_pointer(thresholds->primary, new);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3652

907860ed3   Kirill A. Shutemov   cgroups: make cft...
3653
  	/* To be sure that nobody uses thresholds */
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3654
  	synchronize_rcu();
371528cae   Anton Vorontsov   mm: memcg: Correc...
3655
  unlock:
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3656
  	mutex_unlock(&memcg->thresholds_lock);
2e72b6347   Kirill A. Shutemov   memcg: implement ...
3657
  }
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
3658

59b6f8734   Tejun Heo   memcg: make cgrou...
3659
  static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3660
3661
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3662
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
347c4a874   Tejun Heo   memcg: remove cgr...
3663
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3664
  static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3665
3666
  	struct eventfd_ctx *eventfd)
  {
59b6f8734   Tejun Heo   memcg: make cgrou...
3667
  	return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
347c4a874   Tejun Heo   memcg: remove cgr...
3668
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3669
  static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3670
  	struct eventfd_ctx *eventfd, const char *args)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3671
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3672
  	struct mem_cgroup_eventfd_list *event;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3673

9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3674
3675
3676
  	event = kmalloc(sizeof(*event),	GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
1af8efe96   Michal Hocko   memcg: change mem...
3677
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3678
3679
3680
3681
3682
  
  	event->eventfd = eventfd;
  	list_add(&event->list, &memcg->oom_notify);
  
  	/* already in OOM ? */
c2b42d3ca   Tejun Heo   memcg: convert me...
3683
  	if (memcg->under_oom)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3684
  		eventfd_signal(eventfd, 1);
1af8efe96   Michal Hocko   memcg: change mem...
3685
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3686
3687
3688
  
  	return 0;
  }
59b6f8734   Tejun Heo   memcg: make cgrou...
3689
  static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
347c4a874   Tejun Heo   memcg: remove cgr...
3690
  	struct eventfd_ctx *eventfd)
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3691
  {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3692
  	struct mem_cgroup_eventfd_list *ev, *tmp;
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3693

1af8efe96   Michal Hocko   memcg: change mem...
3694
  	spin_lock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3695

c0ff4b854   Raghavendra K T   memcg: rename mem...
3696
  	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3697
3698
3699
3700
3701
  		if (ev->eventfd == eventfd) {
  			list_del(&ev->list);
  			kfree(ev);
  		}
  	}
1af8efe96   Michal Hocko   memcg: change mem...
3702
  	spin_unlock(&memcg_oom_lock);
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
3703
  }
2da8ca822   Tejun Heo   cgroup: replace c...
3704
  static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3705
  {
2da8ca822   Tejun Heo   cgroup: replace c...
3706
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3707

791badbdb   Tejun Heo   memcg: convert aw...
3708
3709
  	seq_printf(sf, "oom_kill_disable %d
  ", memcg->oom_kill_disable);
c2b42d3ca   Tejun Heo   memcg: convert me...
3710
3711
  	seq_printf(sf, "under_oom %d
  ", (bool)memcg->under_oom);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3712
3713
  	return 0;
  }
182446d08   Tejun Heo   cgroup: pass arou...
3714
  static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3715
3716
  	struct cftype *cft, u64 val)
  {
182446d08   Tejun Heo   cgroup: pass arou...
3717
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3718
3719
  
  	/* cannot set to root cgroup and only 0 and 1 are allowed */
14208b0ec   Linus Torvalds   Merge branch 'for...
3720
  	if (!css->parent || !((val == 0) || (val == 1)))
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3721
  		return -EINVAL;
c0ff4b854   Raghavendra K T   memcg: rename mem...
3722
  	memcg->oom_kill_disable = val;
4d845ebf4   KAMEZAWA Hiroyuki   memcg: fix wake u...
3723
  	if (!val)
c0ff4b854   Raghavendra K T   memcg: rename mem...
3724
  		memcg_oom_recover(memcg);
3dae7fec5   Johannes Weiner   mm: memcontrol: r...
3725

3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
3726
3727
  	return 0;
  }
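
/*
 * memory.oom_control accepts 0 or 1 on non-root groups:
 *
 *	echo 1 > <group>/memory.oom_control	# disable the OOM killer here
 *	echo 0 > <group>/memory.oom_control	# re-enable it (and wake waiters)
 *
 * Reading the file reports oom_kill_disable and under_oom, as printed by
 * mem_cgroup_oom_control_read() above.
 */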
c255a4580   Andrew Morton   memcg: rename con...
3728
  #ifdef CONFIG_MEMCG_KMEM
cbe128e34   Glauber Costa   cgroup: get rid o...
3729
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
3730
  {
55007d849   Glauber Costa   memcg: allocate m...
3731
  	int ret;
55007d849   Glauber Costa   memcg: allocate m...
3732
3733
3734
  	ret = memcg_propagate_kmem(memcg);
  	if (ret)
  		return ret;
2633d7a02   Glauber Costa   slab/slub: consid...
3735

1d62e4365   Glauber Costa   cgroup: pass stru...
3736
  	return mem_cgroup_sockets_init(memcg, ss);
573b400d0   Michel Lespinasse   mm/memcontrol.c: ...
3737
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3738

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3739
3740
  static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
  {
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3741
3742
3743
  	struct cgroup_subsys_state *css;
  	struct mem_cgroup *parent, *child;
  	int kmemcg_id;
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
  	if (!memcg->kmem_acct_active)
  		return;
  
  	/*
  	 * Clear the 'active' flag before clearing memcg_caches arrays entries.
  	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
  	 * guarantees no cache will be created for this cgroup after we are
  	 * done (see memcg_create_kmem_cache()).
  	 */
  	memcg->kmem_acct_active = false;
  
  	memcg_deactivate_kmem_caches(memcg);
2788cf0c4   Vladimir Davydov   memcg: reparent l...
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
  
  	kmemcg_id = memcg->kmemcg_id;
  	BUG_ON(kmemcg_id < 0);
  
  	parent = parent_mem_cgroup(memcg);
  	if (!parent)
  		parent = root_mem_cgroup;
  
  	/*
  	 * Change kmemcg_id of this cgroup and all its descendants to the
  	 * parent's id, and then move all entries from this cgroup's list_lrus
  	 * to ones of the parent. After we have finished, all list_lrus
  	 * corresponding to this cgroup are guaranteed to remain empty. The
  	 * ordering is imposed by list_lru_node->lock taken by
  	 * memcg_drain_all_list_lrus().
  	 */
  	css_for_each_descendant_pre(css, &memcg->css) {
  		child = mem_cgroup_from_css(css);
  		BUG_ON(child->kmemcg_id != kmemcg_id);
  		child->kmemcg_id = parent->kmemcg_id;
  		if (!memcg->use_hierarchy)
  			break;
  	}
  	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
  
  	memcg_free_cache_id(kmemcg_id);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3782
  }
10d5ebf40   Li Zefan   memcg: use css_ge...
3783
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
d1a4c0b37   Glauber Costa   tcp memory pressu...
3784
  {
f48b80a5e   Vladimir Davydov   memcg: cleanup st...
3785
3786
3787
3788
3789
  	if (memcg->kmem_acct_activated) {
  		memcg_destroy_kmem_caches(memcg);
  		static_key_slow_dec(&memcg_kmem_enabled_key);
  		WARN_ON(page_counter_read(&memcg->kmem));
  	}
1d62e4365   Glauber Costa   cgroup: pass stru...
3790
  	mem_cgroup_sockets_destroy(memcg);
10d5ebf40   Li Zefan   memcg: use css_ge...
3791
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3792
  #else
cbe128e34   Glauber Costa   cgroup: get rid o...
3793
  static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
e5671dfae   Glauber Costa   Basic kernel memo...
3794
3795
3796
  {
  	return 0;
  }
d1a4c0b37   Glauber Costa   tcp memory pressu...
3797

2a4db7eb9   Vladimir Davydov   memcg: free memcg...
3798
3799
3800
  static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
  {
  }
10d5ebf40   Li Zefan   memcg: use css_ge...
3801
3802
3803
  static void memcg_destroy_kmem(struct mem_cgroup *memcg)
  {
  }
e5671dfae   Glauber Costa   Basic kernel memo...
3804
  #endif
52ebea749   Tejun Heo   writeback: make b...
3805
3806
3807
3808
3809
3810
  #ifdef CONFIG_CGROUP_WRITEBACK
  
  struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
  {
  	return &memcg->cgwb_list;
  }
841710aa6   Tejun Heo   writeback: implem...
3811
3812
3813
3814
3815
3816
3817
3818
3819
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
  {
  	return wb_domain_init(&memcg->cgwb_domain, gfp);
  }
  
  static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
  {
  	wb_domain_exit(&memcg->cgwb_domain);
  }
2529bb3aa   Tejun Heo   writeback: reset ...
3820
3821
3822
3823
  static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
  {
  	wb_domain_size_changed(&memcg->cgwb_domain);
  }
841710aa6   Tejun Heo   writeback: implem...
3824
3825
3826
3827
3828
3829
3830
3831
3832
  struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
  
  	if (!memcg->css.parent)
  		return NULL;
  
  	return &memcg->cgwb_domain;
  }
c2aa723a6   Tejun Heo   writeback: implem...
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
  /**
   * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
   * @wb: bdi_writeback in question
   * @pavail: out parameter for number of available pages
   * @pdirty: out parameter for number of dirty pages
   * @pwriteback: out parameter for number of pages under writeback
   *
   * Determine the numbers of available, dirty, and writeback pages in @wb's
   * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
   * more involved.
   *
   * A memcg's headroom is "min(max, high) - used".  The available memory is
   * calculated as the lowest headroom of itself and the ancestors plus the
   * number of pages already being used for file pages.  Note that this
   * doesn't consider the actual amount of available memory in the system.
   * The caller should further cap *@pavail accordingly.
   */
  void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
  			 unsigned long *pdirty, unsigned long *pwriteback)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
  	struct mem_cgroup *parent;
  	unsigned long head_room = PAGE_COUNTER_MAX;
  	unsigned long file_pages;
  
  	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
  
  	/* this should eventually include NR_UNSTABLE_NFS */
  	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
  
  	file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
  						    (1 << LRU_ACTIVE_FILE));
  	while ((parent = parent_mem_cgroup(memcg))) {
  		unsigned long ceiling = min(memcg->memory.limit, memcg->high);
  		unsigned long used = page_counter_read(&memcg->memory);
  
  		head_room = min(head_room, ceiling - min(ceiling, used));
  		memcg = parent;
  	}
  
  	*pavail = file_pages + head_room;
  }
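
/*
 * Worked example for the headroom calculation above (numbers are
 * illustrative): with limit = 1GiB, high = 512MiB and 300MiB in use, the
 * ceiling is min(1GiB, 512MiB) = 512MiB and the head room 212MiB; walking
 * up the ancestors keeps the smallest such head room, and *pavail is that
 * head room plus the file pages already charged to the group.
 */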
841710aa6   Tejun Heo   writeback: implem...
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
  #else	/* CONFIG_CGROUP_WRITEBACK */
  
  static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
  {
  	return 0;
  }
  
  static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
  {
  }
2529bb3aa   Tejun Heo   writeback: reset ...
3885
3886
3887
  static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
  {
  }
52ebea749   Tejun Heo   writeback: make b...
3888
  #endif	/* CONFIG_CGROUP_WRITEBACK */
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3889
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
   * DO NOT USE IN NEW FILES.
   *
   * "cgroup.event_control" implementation.
   *
   * This is way over-engineered.  It tries to support fully configurable
   * events for each user.  Such level of flexibility is completely
   * unnecessary especially in the light of the planned unified hierarchy.
   *
   * Please deprecate this and replace with something simpler if at all
   * possible.
   */
  
  /*
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3903
3904
3905
3906
   * Unregister event and free resources.
   *
   * Gets called from workqueue.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3907
  static void memcg_event_remove(struct work_struct *work)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3908
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3909
3910
  	struct mem_cgroup_event *event =
  		container_of(work, struct mem_cgroup_event, remove);
59b6f8734   Tejun Heo   memcg: make cgrou...
3911
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3912
3913
  
  	remove_wait_queue(event->wqh, &event->wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3914
  	event->unregister_event(memcg, event->eventfd);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3915
3916
3917
3918
3919
3920
  
  	/* Notify userspace the event is going away. */
  	eventfd_signal(event->eventfd, 1);
  
  	eventfd_ctx_put(event->eventfd);
  	kfree(event);
59b6f8734   Tejun Heo   memcg: make cgrou...
3921
  	css_put(&memcg->css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3922
3923
3924
3925
3926
3927
3928
  }
  
  /*
   * Gets called on POLLHUP on eventfd when user closes it.
   *
   * Called with wqh->lock held and interrupts disabled.
   */
3bc942f37   Tejun Heo   memcg: rename cgr...
3929
3930
  static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
  			    int sync, void *key)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3931
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3932
3933
  	struct mem_cgroup_event *event =
  		container_of(wait, struct mem_cgroup_event, wait);
59b6f8734   Tejun Heo   memcg: make cgrou...
3934
  	struct mem_cgroup *memcg = event->memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
  	unsigned long flags = (unsigned long)key;
  
  	if (flags & POLLHUP) {
  		/*
  		 * If the event has been detached at cgroup removal, we
  		 * can simply return knowing the other side will clean up
  		 * for us.
  		 *
  		 * We can't race against event freeing since the other
  		 * side will require wqh->lock via remove_wait_queue(),
  		 * which we hold.
  		 */
fba948078   Tejun Heo   cgroup, memcg: mo...
3947
  		spin_lock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3948
3949
3950
3951
3952
3953
3954
3955
  		if (!list_empty(&event->list)) {
  			list_del_init(&event->list);
  			/*
  			 * We are in atomic context, but memcg_event_remove()
  			 * may sleep, so we have to call it from a workqueue.
  			 */
  			schedule_work(&event->remove);
  		}
fba948078   Tejun Heo   cgroup, memcg: mo...
3956
  		spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3957
3958
3959
3960
  	}
  
  	return 0;
  }
3bc942f37   Tejun Heo   memcg: rename cgr...
3961
  static void memcg_event_ptable_queue_proc(struct file *file,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3962
3963
  		wait_queue_head_t *wqh, poll_table *pt)
  {
3bc942f37   Tejun Heo   memcg: rename cgr...
3964
3965
  	struct mem_cgroup_event *event =
  		container_of(pt, struct mem_cgroup_event, pt);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3966
3967
3968
3969
3970
3971
  
  	event->wqh = wqh;
  	add_wait_queue(wqh, &event->wait);
  }
  
  /*
3bc942f37   Tejun Heo   memcg: rename cgr...
3972
3973
   * DO NOT USE IN NEW FILES.
   *
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3974
3975
3976
3977
3978
   * Parse input and register new cgroup event handler.
   *
   * Input must be in format '<event_fd> <control_fd> <args>'.
   * Interpretation of args is defined by control file implementation.
   */
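  /*
   * Minimal userspace sketch of the interface parsed below (an
   * illustration only, not part of this file).  It assumes a v1 memory
   * hierarchy mounted at /sys/fs/cgroup/memory with an existing group
   * "foo", registers an 8M usage threshold and then waits for it to
   * fire; error handling is omitted for brevity:
   *
   *	#include <stdint.h>
   *	#include <stdio.h>
   *	#include <string.h>
   *	#include <fcntl.h>
   *	#include <unistd.h>
   *	#include <sys/eventfd.h>
   *
   *	int main(void)
   *	{
   *		int efd = eventfd(0, 0);
   *		int cfd = open("/sys/fs/cgroup/memory/foo/memory.usage_in_bytes",
   *			       O_RDONLY);
   *		int ctl = open("/sys/fs/cgroup/memory/foo/cgroup.event_control",
   *			       O_WRONLY);
   *		char buf[64];
   *		uint64_t cnt;
   *
   *		snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 8ULL << 20);
   *		write(ctl, buf, strlen(buf));
   *		read(efd, &cnt, sizeof(cnt));
   *		printf("threshold crossed %llu time(s)\n",
   *		       (unsigned long long)cnt);
   *		return 0;
   *	}
   */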
451af504d   Tejun Heo   cgroup: replace c...
3979
3980
  static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
  					 char *buf, size_t nbytes, loff_t off)
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3981
  {
451af504d   Tejun Heo   cgroup: replace c...
3982
  	struct cgroup_subsys_state *css = of_css(of);
fba948078   Tejun Heo   cgroup, memcg: mo...
3983
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
3984
  	struct mem_cgroup_event *event;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3985
3986
3987
3988
  	struct cgroup_subsys_state *cfile_css;
  	unsigned int efd, cfd;
  	struct fd efile;
  	struct fd cfile;
fba948078   Tejun Heo   cgroup, memcg: mo...
3989
  	const char *name;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3990
3991
  	char *endp;
  	int ret;
451af504d   Tejun Heo   cgroup: replace c...
3992
3993
3994
  	buf = strstrip(buf);
  
  	efd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3995
3996
  	if (*endp != ' ')
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
3997
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
3998

451af504d   Tejun Heo   cgroup: replace c...
3999
  	cfd = simple_strtoul(buf, &endp, 10);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4000
4001
  	if ((*endp != ' ') && (*endp != '\0'))
  		return -EINVAL;
451af504d   Tejun Heo   cgroup: replace c...
4002
  	buf = endp + 1;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4003
4004
4005
4006
  
  	event = kzalloc(sizeof(*event), GFP_KERNEL);
  	if (!event)
  		return -ENOMEM;
59b6f8734   Tejun Heo   memcg: make cgrou...
4007
  	event->memcg = memcg;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4008
  	INIT_LIST_HEAD(&event->list);
3bc942f37   Tejun Heo   memcg: rename cgr...
4009
4010
4011
  	init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
  	init_waitqueue_func_entry(&event->wait, memcg_event_wake);
  	INIT_WORK(&event->remove, memcg_event_remove);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
  
  	efile = fdget(efd);
  	if (!efile.file) {
  		ret = -EBADF;
  		goto out_kfree;
  	}
  
  	event->eventfd = eventfd_ctx_fileget(efile.file);
  	if (IS_ERR(event->eventfd)) {
  		ret = PTR_ERR(event->eventfd);
  		goto out_put_efile;
  	}
  
  	cfile = fdget(cfd);
  	if (!cfile.file) {
  		ret = -EBADF;
  		goto out_put_eventfd;
  	}
  
  	/* the process needs read permission on the control file */
  	/* AV: shouldn't we check that it's been opened for read instead? */
  	ret = inode_permission(file_inode(cfile.file), MAY_READ);
  	if (ret < 0)
  		goto out_put_cfile;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4036
  	/*
fba948078   Tejun Heo   cgroup, memcg: mo...
4037
4038
4039
4040
  	 * Determine the event callbacks and set them in @event.  This used
  	 * to be done via struct cftype but cgroup core no longer knows
  	 * about these events.  The following is crude but the whole thing
  	 * is for compatibility anyway.
3bc942f37   Tejun Heo   memcg: rename cgr...
4041
4042
  	 *
  	 * DO NOT ADD NEW FILES.
fba948078   Tejun Heo   cgroup, memcg: mo...
4043
  	 */
b583043e9   Al Viro   kill f_dentry uses
4044
  	name = cfile.file->f_path.dentry->d_name.name;
fba948078   Tejun Heo   cgroup, memcg: mo...
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
  
  	if (!strcmp(name, "memory.usage_in_bytes")) {
  		event->register_event = mem_cgroup_usage_register_event;
  		event->unregister_event = mem_cgroup_usage_unregister_event;
  	} else if (!strcmp(name, "memory.oom_control")) {
  		event->register_event = mem_cgroup_oom_register_event;
  		event->unregister_event = mem_cgroup_oom_unregister_event;
  	} else if (!strcmp(name, "memory.pressure_level")) {
  		event->register_event = vmpressure_register_event;
  		event->unregister_event = vmpressure_unregister_event;
  	} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
347c4a874   Tejun Heo   memcg: remove cgr...
4056
4057
  		event->register_event = memsw_cgroup_usage_register_event;
  		event->unregister_event = memsw_cgroup_usage_unregister_event;
fba948078   Tejun Heo   cgroup, memcg: mo...
4058
4059
4060
4061
4062
4063
  	} else {
  		ret = -EINVAL;
  		goto out_put_cfile;
  	}
  
  	/*
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
4064
4065
4066
  	 * Verify that @cfile belongs to @css.  Also, remaining events are
  	 * automatically removed on cgroup destruction but the removal is
  	 * asynchronous, so take an extra ref on @css.
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4067
  	 */
b583043e9   Al Viro   kill f_dentry uses
4068
  	cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
ec903c0c8   Tejun Heo   cgroup: rename cs...
4069
  					       &memory_cgrp_subsys);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4070
  	ret = -EINVAL;
5a17f543e   Tejun Heo   cgroup: improve c...
4071
  	if (IS_ERR(cfile_css))
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4072
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
4073
4074
  	if (cfile_css != css) {
  		css_put(cfile_css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4075
  		goto out_put_cfile;
5a17f543e   Tejun Heo   cgroup: improve c...
4076
  	}
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4077

451af504d   Tejun Heo   cgroup: replace c...
4078
  	ret = event->register_event(memcg, event->eventfd, buf);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4079
4080
4081
4082
  	if (ret)
  		goto out_put_css;
  
  	efile.file->f_op->poll(efile.file, &event->pt);
fba948078   Tejun Heo   cgroup, memcg: mo...
4083
4084
4085
  	spin_lock(&memcg->event_list_lock);
  	list_add(&event->list, &memcg->event_list);
  	spin_unlock(&memcg->event_list_lock);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4086
4087
4088
  
  	fdput(cfile);
  	fdput(efile);
451af504d   Tejun Heo   cgroup: replace c...
4089
  	return nbytes;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4090
4091
  
  out_put_css:
b5557c4c3   Tejun Heo   memcg: cgroup_wri...
4092
  	css_put(css);
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
  out_put_cfile:
  	fdput(cfile);
  out_put_eventfd:
  	eventfd_ctx_put(event->eventfd);
  out_put_efile:
  	fdput(efile);
  out_kfree:
  	kfree(event);
  
  	return ret;
  }
241994ed8   Johannes Weiner   mm: memcontrol: d...
4104
  static struct cftype mem_cgroup_legacy_files[] = {
8cdea7c05   Balbir Singh   Memory controller...
4105
  	{
0eea10301   Balbir Singh   Memory controller...
4106
  		.name = "usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4107
  		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4108
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4109
4110
  	},
  	{
c84872e16   Pavel Emelyanov   memcgroup: add th...
4111
  		.name = "max_usage_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4112
  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4113
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4114
  		.read_u64 = mem_cgroup_read_u64,
c84872e16   Pavel Emelyanov   memcgroup: add th...
4115
4116
  	},
  	{
0eea10301   Balbir Singh   Memory controller...
4117
  		.name = "limit_in_bytes",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4118
  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4119
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4120
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4121
4122
  	},
  	{
296c81d89   Balbir Singh   memory controller...
4123
4124
  		.name = "soft_limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4125
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4126
  		.read_u64 = mem_cgroup_read_u64,
296c81d89   Balbir Singh   memory controller...
4127
4128
  	},
  	{
8cdea7c05   Balbir Singh   Memory controller...
4129
  		.name = "failcnt",
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4130
  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4131
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4132
  		.read_u64 = mem_cgroup_read_u64,
8cdea7c05   Balbir Singh   Memory controller...
4133
  	},
8697d3319   Balbir Singh   Memory controller...
4134
  	{
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4135
  		.name = "stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4136
  		.seq_show = memcg_stat_show,
d2ceb9b7d   KAMEZAWA Hiroyuki   memory cgroup enh...
4137
  	},
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4138
4139
  	{
  		.name = "force_empty",
6770c64e5   Tejun Heo   cgroup: replace c...
4140
  		.write = mem_cgroup_force_empty_write,
c1e862c1f   KAMEZAWA Hiroyuki   memcg: new force_...
4141
  	},
18f59ea7d   Balbir Singh   memcg: memory cgr...
4142
4143
4144
4145
4146
  	{
  		.name = "use_hierarchy",
  		.write_u64 = mem_cgroup_hierarchy_write,
  		.read_u64 = mem_cgroup_hierarchy_read,
  	},
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4147
  	{
3bc942f37   Tejun Heo   memcg: rename cgr...
4148
  		.name = "cgroup.event_control",		/* XXX: for compat */
451af504d   Tejun Heo   cgroup: replace c...
4149
  		.write = memcg_write_event_control,
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4150
4151
4152
4153
  		.flags = CFTYPE_NO_PREFIX,
  		.mode = S_IWUGO,
  	},
  	{
a7885eb8a   KOSAKI Motohiro   memcg: swappiness
4154
4155
4156
4157
  		.name = "swappiness",
  		.read_u64 = mem_cgroup_swappiness_read,
  		.write_u64 = mem_cgroup_swappiness_write,
  	},
7dc74be03   Daisuke Nishimura   memcg: add interf...
4158
4159
4160
4161
4162
  	{
  		.name = "move_charge_at_immigrate",
  		.read_u64 = mem_cgroup_move_charge_read,
  		.write_u64 = mem_cgroup_move_charge_write,
  	},
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4163
4164
  	{
  		.name = "oom_control",
2da8ca822   Tejun Heo   cgroup: replace c...
4165
  		.seq_show = mem_cgroup_oom_control_read,
3c11ecf44   KAMEZAWA Hiroyuki   memcg: oom kill d...
4166
  		.write_u64 = mem_cgroup_oom_control_write,
9490ff275   KAMEZAWA Hiroyuki   memcg: oom notifier
4167
4168
  		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
  	},
70ddf637e   Anton Vorontsov   memcg: add memory...
4169
4170
  	{
  		.name = "pressure_level",
70ddf637e   Anton Vorontsov   memcg: add memory...
4171
  	},
406eb0c9b   Ying Han   memcg: add memory...
4172
4173
4174
  #ifdef CONFIG_NUMA
  	{
  		.name = "numa_stat",
2da8ca822   Tejun Heo   cgroup: replace c...
4175
  		.seq_show = memcg_numa_stat_show,
406eb0c9b   Ying Han   memcg: add memory...
4176
4177
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4178
4179
4180
4181
  #ifdef CONFIG_MEMCG_KMEM
  	{
  		.name = "kmem.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
451af504d   Tejun Heo   cgroup: replace c...
4182
  		.write = mem_cgroup_write,
791badbdb   Tejun Heo   memcg: convert aw...
4183
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4184
4185
4186
4187
  	},
  	{
  		.name = "kmem.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
791badbdb   Tejun Heo   memcg: convert aw...
4188
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4189
4190
4191
4192
  	},
  	{
  		.name = "kmem.failcnt",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6770c64e5   Tejun Heo   cgroup: replace c...
4193
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4194
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4195
4196
4197
4198
  	},
  	{
  		.name = "kmem.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6770c64e5   Tejun Heo   cgroup: replace c...
4199
  		.write = mem_cgroup_reset,
791badbdb   Tejun Heo   memcg: convert aw...
4200
  		.read_u64 = mem_cgroup_read_u64,
510fc4e11   Glauber Costa   memcg: kmem accou...
4201
  	},
749c54151   Glauber Costa   memcg: aggregate ...
4202
4203
4204
  #ifdef CONFIG_SLABINFO
  	{
  		.name = "kmem.slabinfo",
b047501cd   Vladimir Davydov   memcg: use generi...
4205
4206
4207
4208
  		.seq_start = slab_start,
  		.seq_next = slab_next,
  		.seq_stop = slab_stop,
  		.seq_show = memcg_slab_show,
749c54151   Glauber Costa   memcg: aggregate ...
4209
4210
  	},
  #endif
510fc4e11   Glauber Costa   memcg: kmem accou...
4211
  #endif
6bc103498   Tejun Heo   cgroup: convert m...
4212
  	{ },	/* terminate */
af36f906c   Tejun Heo   memcg: always cre...
4213
  };
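  
  /*
   * The table above only backs the legacy (v1) control files.  The
   * unified (v2) hierarchy exposes a much smaller, separately defined
   * set of files (e.g. memory.current and memory.high) elsewhere in
   * this file.
   */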
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4214

c0ff4b854   Raghavendra K T   memcg: rename mem...
4215
  static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4216
4217
  {
  	struct mem_cgroup_per_node *pn;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4218
  	struct mem_cgroup_per_zone *mz;
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
4219
  	int zone, tmp = node;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4220
4221
4222
4223
4224
4225
4226
4227
  	/*
  	 * This routine is called for each possible node, but it is a bug
  	 * to call kmalloc() against an offline node.
  	 *
  	 * TODO: this routine can waste a lot of memory for nodes which will
  	 *       never be onlined.  It would be better to use a memory
  	 *       hotplug callback instead.
  	 */
41e3355de   KAMEZAWA Hiroyuki   memcg: fix node_s...
4228
4229
  	if (!node_state(node, N_NORMAL_MEMORY))
  		tmp = -1;
17295c88a   Jesper Juhl   memcg: use [kv]za...
4230
  	pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4231
4232
  	if (!pn)
  		return 1;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4233

1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4234
4235
  	for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  		mz = &pn->zoneinfo[zone];
bea8c150a   Hugh Dickins   memcg: fix hotplu...
4236
  		lruvec_init(&mz->lruvec);
bb4cc1a8b   Andrew Morton   revert "memcg: ge...
4237
4238
  		mz->usage_in_excess = 0;
  		mz->on_tree = false;
d79154bb5   Hugh Dickins   memcg: replace me...
4239
  		mz->memcg = memcg;
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4240
  	}
54f72fe02   Johannes Weiner   memcg: clean up m...
4241
  	memcg->nodeinfo[node] = pn;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4242
4243
  	return 0;
  }
c0ff4b854   Raghavendra K T   memcg: rename mem...
4244
  static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4245
  {
54f72fe02   Johannes Weiner   memcg: clean up m...
4246
  	kfree(memcg->nodeinfo[node]);
1ecaab2bd   KAMEZAWA Hiroyuki   per-zone and recl...
4247
  }
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4248
4249
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
d79154bb5   Hugh Dickins   memcg: replace me...
4250
  	struct mem_cgroup *memcg;
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4251
  	size_t size;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4252

8ff69e2c8   Vladimir Davydov   memcg: do not use...
4253
4254
  	size = sizeof(struct mem_cgroup);
  	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4255

8ff69e2c8   Vladimir Davydov   memcg: do not use...
4256
  	memcg = kzalloc(size, GFP_KERNEL);
d79154bb5   Hugh Dickins   memcg: replace me...
4257
  	if (!memcg)
e7bbcdf37   Dan Carpenter   memcontrol: fix p...
4258
  		return NULL;
d79154bb5   Hugh Dickins   memcg: replace me...
4259
4260
  	memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
  	if (!memcg->stat)
d2e61b8dc   Dan Carpenter   memcg: null deref...
4261
  		goto out_free;
841710aa6   Tejun Heo   writeback: implem...
4262
4263
4264
  
  	if (memcg_wb_domain_init(memcg, GFP_KERNEL))
  		goto out_free_stat;
d79154bb5   Hugh Dickins   memcg: replace me...
4265
4266
  	spin_lock_init(&memcg->pcp_counter_lock);
  	return memcg;
d2e61b8dc   Dan Carpenter   memcg: null deref...
4267

841710aa6   Tejun Heo   writeback: implem...
4268
4269
  out_free_stat:
  	free_percpu(memcg->stat);
d2e61b8dc   Dan Carpenter   memcg: null deref...
4270
  out_free:
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4271
  	kfree(memcg);
d2e61b8dc   Dan Carpenter   memcg: null deref...
4272
  	return NULL;
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4273
  }
8c7c6e34a   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4274
  /*
c8b2a36fb   Glauber Costa   memcg: execute th...
4275
4276
4277
4278
4279
4280
4281
4282
   * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
   * (scanning them all at force_empty would be too costly...)
   *
   * Instead of clearing all references at force_empty, we remember
   * the number of references from swap_cgroup and free the mem_cgroup
   * when it goes down to 0.
   *
   * Removal of the cgroup itself succeeds regardless of refs from swap.
59927fb98   Hugh Dickins   memcg: free mem_c...
4283
   */
c8b2a36fb   Glauber Costa   memcg: execute th...
4284
4285
  
  static void __mem_cgroup_free(struct mem_cgroup *memcg)
59927fb98   Hugh Dickins   memcg: free mem_c...
4286
  {
c8b2a36fb   Glauber Costa   memcg: execute th...
4287
  	int node;
59927fb98   Hugh Dickins   memcg: free mem_c...
4288

bb4cc1a8b   Andrew Morton   revert "memcg: ge...
4289
  	mem_cgroup_remove_from_trees(memcg);
c8b2a36fb   Glauber Costa   memcg: execute th...
4290
4291
4292
4293
4294
  
  	for_each_node(node)
  		free_mem_cgroup_per_zone_info(memcg, node);
  
  	free_percpu(memcg->stat);
841710aa6   Tejun Heo   writeback: implem...
4295
  	memcg_wb_domain_exit(memcg);
8ff69e2c8   Vladimir Davydov   memcg: do not use...
4296
  	kfree(memcg);
59927fb98   Hugh Dickins   memcg: free mem_c...
4297
  }
3afe36b1f   Glauber Costa   memcg: always fre...
4298

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4299
4300
4301
  /*
   * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
   */
e1aab161e   Glauber Costa   socket: initial c...
4302
  struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4303
  {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4304
  	if (!memcg->memory.parent)
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4305
  		return NULL;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4306
  	return mem_cgroup_from_counter(memcg->memory.parent, memory);
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4307
  }
e1aab161e   Glauber Costa   socket: initial c...
4308
  EXPORT_SYMBOL(parent_mem_cgroup);
333279487   KAMEZAWA Hiroyuki   memcgroup: use vm...
4309

0eb253e22   Li Zefan   memcg: fix sectio...
4310
  static struct cgroup_subsys_state * __ref
eb95419b0   Tejun Heo   cgroup: pass arou...
4311
  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8cdea7c05   Balbir Singh   Memory controller...
4312
  {
d142e3e66   Glauber Costa   memcg: split part...
4313
  	struct mem_cgroup *memcg;
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
4314
  	long error = -ENOMEM;
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4315
  	int node;
8cdea7c05   Balbir Singh   Memory controller...
4316

c0ff4b854   Raghavendra K T   memcg: rename mem...
4317
4318
  	memcg = mem_cgroup_alloc();
  	if (!memcg)
04046e1a0   KAMEZAWA Hiroyuki   memcg: use CSS ID
4319
  		return ERR_PTR(error);
78fb74669   Pavel Emelianov   Memory controller...
4320

3ed28fa10   Bob Liu   memcg: cleanup fo...
4321
  	for_each_node(node)
c0ff4b854   Raghavendra K T   memcg: rename mem...
4322
  		if (alloc_mem_cgroup_per_zone_info(memcg, node))
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
4323
  			goto free_out;
f64c3f549   Balbir Singh   memory controller...
4324

c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
4325
  	/* root ? */
eb95419b0   Tejun Heo   cgroup: pass arou...
4326
  	if (parent_css == NULL) {
a41c58a66   Hillf Danton   memcg: keep root ...
4327
  		root_mem_cgroup = memcg;
56161634e   Tejun Heo   memcg: add mem_cg...
4328
  		mem_cgroup_root_css = &memcg->css;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4329
  		page_counter_init(&memcg->memory, NULL);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4330
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4331
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4332
4333
  		page_counter_init(&memcg->memsw, NULL);
  		page_counter_init(&memcg->kmem, NULL);
18f59ea7d   Balbir Singh   memcg: memory cgr...
4334
  	}
28dbc4b6a   Balbir Singh   memcg: memory cgr...
4335

d142e3e66   Glauber Costa   memcg: split part...
4336
4337
  	memcg->last_scanned_node = MAX_NUMNODES;
  	INIT_LIST_HEAD(&memcg->oom_notify);
d142e3e66   Glauber Costa   memcg: split part...
4338
4339
4340
  	memcg->move_charge_at_immigrate = 0;
  	mutex_init(&memcg->thresholds_lock);
  	spin_lock_init(&memcg->move_lock);
70ddf637e   Anton Vorontsov   memcg: add memory...
4341
  	vmpressure_init(&memcg->vmpressure);
fba948078   Tejun Heo   cgroup, memcg: mo...
4342
4343
  	INIT_LIST_HEAD(&memcg->event_list);
  	spin_lock_init(&memcg->event_list_lock);
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
4344
4345
  #ifdef CONFIG_MEMCG_KMEM
  	memcg->kmemcg_id = -1;
900a38f02   Vladimir Davydov   memcg: zap kmem_a...
4346
  #endif
52ebea749   Tejun Heo   writeback: make b...
4347
4348
4349
  #ifdef CONFIG_CGROUP_WRITEBACK
  	INIT_LIST_HEAD(&memcg->cgwb_list);
  #endif
d142e3e66   Glauber Costa   memcg: split part...
4350
4351
4352
4353
4354
4355
4356
4357
  	return &memcg->css;
  
  free_out:
  	__mem_cgroup_free(memcg);
  	return ERR_PTR(error);
  }
  
  static int
eb95419b0   Tejun Heo   cgroup: pass arou...
4358
  mem_cgroup_css_online(struct cgroup_subsys_state *css)
d142e3e66   Glauber Costa   memcg: split part...
4359
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4360
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5c9d535b8   Tejun Heo   cgroup: remove cs...
4361
  	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
4362
  	int ret;
d142e3e66   Glauber Costa   memcg: split part...
4363

15a4c835e   Tejun Heo   cgroup, memcg: im...
4364
  	if (css->id > MEM_CGROUP_ID_MAX)
4219b2da2   Li Zefan   memcg: fail to cr...
4365
  		return -ENOSPC;
638769869   Tejun Heo   cgroup: add css_p...
4366
  	if (!parent)
d142e3e66   Glauber Costa   memcg: split part...
4367
  		return 0;
0999821b1   Glauber Costa   memcg: replace cg...
4368
  	mutex_lock(&memcg_create_mutex);
d142e3e66   Glauber Costa   memcg: split part...
4369
4370
4371
4372
4373
4374
  
  	memcg->use_hierarchy = parent->use_hierarchy;
  	memcg->oom_kill_disable = parent->oom_kill_disable;
  	memcg->swappiness = mem_cgroup_swappiness(parent);
  
  	if (parent->use_hierarchy) {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4375
  		page_counter_init(&memcg->memory, &parent->memory);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4376
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4377
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4378
4379
  		page_counter_init(&memcg->memsw, &parent->memsw);
  		page_counter_init(&memcg->kmem, &parent->kmem);
55007d849   Glauber Costa   memcg: allocate m...
4380

7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4381
  		/*
8d76a9797   Li Zefan   memcg: don't need...
4382
4383
  		 * No need to take a reference to the parent because cgroup
  		 * core guarantees its existence.
7bcc1bb12   Daisuke Nishimura   memcg: get/put pa...
4384
  		 */
18f59ea7d   Balbir Singh   memcg: memory cgr...
4385
  	} else {
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4386
  		page_counter_init(&memcg->memory, NULL);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4387
  		memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4388
  		memcg->soft_limit = PAGE_COUNTER_MAX;
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4389
4390
  		page_counter_init(&memcg->memsw, NULL);
  		page_counter_init(&memcg->kmem, NULL);
8c7f6edbd   Tejun Heo   cgroup: mark subs...
4391
4392
4393
4394
4395
  		/*
  		 * Deeper hierarchy with use_hierarchy == false doesn't make
  		 * much sense so let cgroup subsystem know about this
  		 * unfortunate state in our controller.
  		 */
d142e3e66   Glauber Costa   memcg: split part...
4396
  		if (parent != root_mem_cgroup)
073219e99   Tejun Heo   cgroup: clean up ...
4397
  			memory_cgrp_subsys.broken_hierarchy = true;
18f59ea7d   Balbir Singh   memcg: memory cgr...
4398
  	}
0999821b1   Glauber Costa   memcg: replace cg...
4399
  	mutex_unlock(&memcg_create_mutex);
d64416377   Vladimir Davydov   memcg: rework mem...
4400

2f7dd7a41   Johannes Weiner   mm: memcontrol: d...
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
  	ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
  	if (ret)
  		return ret;
  
  	/*
  	 * Make sure the memcg is initialized: mem_cgroup_iter()
  	 * orders reading memcg->initialized against its callers
  	 * reading the memcg members.
  	 */
  	smp_store_release(&memcg->initialized, 1);
  
  	return 0;
8cdea7c05   Balbir Singh   Memory controller...
4413
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4414
  static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4415
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4416
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3bc942f37   Tejun Heo   memcg: rename cgr...
4417
  	struct mem_cgroup_event *event, *tmp;
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4418
4419
4420
4421
4422
4423
  
  	/*
  	 * Unregister events and notify userspace.
  	 * Notify userspace about cgroup removal only after the rmdir of the
  	 * cgroup directory to avoid a race between userspace and kernel space.
  	 */
fba948078   Tejun Heo   cgroup, memcg: mo...
4424
4425
  	spin_lock(&memcg->event_list_lock);
  	list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
79bd9814e   Tejun Heo   cgroup, memcg: mo...
4426
4427
4428
  		list_del_init(&event->list);
  		schedule_work(&event->remove);
  	}
fba948078   Tejun Heo   cgroup, memcg: mo...
4429
  	spin_unlock(&memcg->event_list_lock);
ec64f5154   KAMEZAWA Hiroyuki   cgroup: fix frequ...
4430

33cb876e9   Michal Hocko   vmpressure: make ...
4431
  	vmpressure_cleanup(&memcg->vmpressure);
2a4db7eb9   Vladimir Davydov   memcg: free memcg...
4432
4433
  
  	memcg_deactivate_kmem(memcg);
52ebea749   Tejun Heo   writeback: make b...
4434
4435
  
  	wb_memcg_offline(memcg);
df878fb04   KAMEZAWA Hiroyuki   memory cgroup enh...
4436
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4437
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
8cdea7c05   Balbir Singh   Memory controller...
4438
  {
eb95419b0   Tejun Heo   cgroup: pass arou...
4439
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
c268e9946   Daisuke Nishimura   memcg: fix hierar...
4440

10d5ebf40   Li Zefan   memcg: use css_ge...
4441
  	memcg_destroy_kmem(memcg);
465939a1f   Li Zefan   memcg: don't need...
4442
  	__mem_cgroup_free(memcg);
8cdea7c05   Balbir Singh   Memory controller...
4443
  }
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
  /**
   * mem_cgroup_css_reset - reset the states of a mem_cgroup
   * @css: the target css
   *
   * Reset the states of the mem_cgroup associated with @css.  This is
   * invoked when the userland requests disabling on the default hierarchy
   * but the memcg is pinned through dependency.  The memcg should stop
   * applying policies and should revert to the vanilla state as it may be
   * made visible again.
   *
   * The current implementation only resets the essential configurations.
   * This needs to be expanded to cover all the visible parts.
   */
  static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4460
4461
4462
  	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
  	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
  	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
241994ed8   Johannes Weiner   mm: memcontrol: d...
4463
4464
  	memcg->low = 0;
  	memcg->high = PAGE_COUNTER_MAX;
24d404dc1   Johannes Weiner   mm: memcontrol: s...
4465
  	memcg->soft_limit = PAGE_COUNTER_MAX;
2529bb3aa   Tejun Heo   writeback: reset ...
4466
  	memcg_wb_domain_size_changed(memcg);
1ced953b1   Tejun Heo   blkcg, memcg: mak...
4467
  }
024914477   Daisuke Nishimura   memcg: move charg...
4468
  #ifdef CONFIG_MMU
7dc74be03   Daisuke Nishimura   memcg: add interf...
4469
  /* Handlers for move charge at task migration. */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4470
  static int mem_cgroup_do_precharge(unsigned long count)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4471
  {
05b843012   Johannes Weiner   mm: memcontrol: u...
4472
  	int ret;
9476db974   Johannes Weiner   mm: memcontrol: s...
4473
4474
  
  	/* Try a single bulk charge without reclaim first */
00501b531   Johannes Weiner   mm: memcontrol: r...
4475
  	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
9476db974   Johannes Weiner   mm: memcontrol: s...
4476
  	if (!ret) {
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4477
  		mc.precharge += count;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4478
4479
  		return ret;
  	}
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4480
  	if (ret == -EINTR) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4481
  		cancel_charge(root_mem_cgroup, count);
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4482
4483
  		return ret;
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4484
4485
  
  	/* Try charges one by one with reclaim */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4486
  	while (count--) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4487
  		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
9476db974   Johannes Weiner   mm: memcontrol: s...
4488
4489
4490
  		/*
  		 * In case of failure, any residual charges against
  		 * mc.to will be dropped by mem_cgroup_clear_mc()
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4491
4492
  		 * later on.  However, cancel any charges that are
  		 * bypassed to root right away or they'll be lost.
9476db974   Johannes Weiner   mm: memcontrol: s...
4493
  		 */
692e7c45d   Johannes Weiner   mm: memcontrol: c...
4494
  		if (ret == -EINTR)
00501b531   Johannes Weiner   mm: memcontrol: r...
4495
  			cancel_charge(root_mem_cgroup, 1);
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4496
  		if (ret)
38c5d72f3   KAMEZAWA Hiroyuki   memcg: simplify L...
4497
  			return ret;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4498
  		mc.precharge++;
9476db974   Johannes Weiner   mm: memcontrol: s...
4499
  		cond_resched();
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4500
  	}
9476db974   Johannes Weiner   mm: memcontrol: s...
4501
  	return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4502
4503
4504
  }
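  
  /*
   * The precharge pool filled above is consumed later, one page (or
   * HPAGE_PMD_NR pages for a THP) at a time, by
   * mem_cgroup_move_charge_pte_range() below.  The bulk attempt is made
   * without allowing reclaim; only if that fails are pages charged one
   * by one with reclaim enabled, so a tight destination group still
   * makes (slow) progress.
   */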
  
  /**
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4505
   * get_mctgt_type - get target type of moving charge
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4506
4507
4508
   * @vma: the vma to which the pte to be checked belongs
   * @addr: the address corresponding to the pte to be checked
   * @ptent: the pte to be checked
024914477   Daisuke Nishimura   memcg: move charg...
4509
   * @target: the pointer in which the target page or swap entry is stored (can be NULL)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4510
4511
4512
4513
4514
4515
   *
   * Returns
   *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
   *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
   *     move charge. if @target is not NULL, the page is stored in target->page
   *     with an extra refcount taken (callers should handle it).
024914477   Daisuke Nishimura   memcg: move charg...
4516
4517
4518
   *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
   *     target for charge migration. if @target is not NULL, the entry is stored
   *     in target->ent.
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4519
4520
4521
   *
   * Called with pte lock held.
   */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4522
4523
  union mc_target {
  	struct page	*page;
024914477   Daisuke Nishimura   memcg: move charg...
4524
  	swp_entry_t	ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4525
  };
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4526
  enum mc_target_type {
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4527
  	MC_TARGET_NONE = 0,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4528
  	MC_TARGET_PAGE,
024914477   Daisuke Nishimura   memcg: move charg...
4529
  	MC_TARGET_SWAP,
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4530
  };
90254a658   Daisuke Nishimura   memcg: clean up m...
4531
4532
  static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
  						unsigned long addr, pte_t ptent)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4533
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4534
  	struct page *page = vm_normal_page(vma, addr, ptent);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4535

90254a658   Daisuke Nishimura   memcg: clean up m...
4536
4537
4538
  	if (!page || !page_mapped(page))
  		return NULL;
  	if (PageAnon(page)) {
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4539
  		if (!(mc.flags & MOVE_ANON))
90254a658   Daisuke Nishimura   memcg: clean up m...
4540
  			return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4541
4542
4543
4544
  	} else {
  		if (!(mc.flags & MOVE_FILE))
  			return NULL;
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4545
4546
4547
4548
4549
  	if (!get_page_unless_zero(page))
  		return NULL;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4550
  #ifdef CONFIG_SWAP
90254a658   Daisuke Nishimura   memcg: clean up m...
4551
4552
4553
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
90254a658   Daisuke Nishimura   memcg: clean up m...
4554
4555
  	struct page *page = NULL;
  	swp_entry_t ent = pte_to_swp_entry(ptent);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4556
  	if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
90254a658   Daisuke Nishimura   memcg: clean up m...
4557
  		return NULL;
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4558
4559
4560
4561
  	/*
  	 * Because lookup_swap_cache() updates some statistics counters,
  	 * we call find_get_page() with swapper_space directly.
  	 */
33806f06d   Shaohua Li   swap: make each s...
4562
  	page = find_get_page(swap_address_space(ent), ent.val);
90254a658   Daisuke Nishimura   memcg: clean up m...
4563
4564
4565
4566
4567
  	if (do_swap_account)
  		entry->val = ent.val;
  
  	return page;
  }
4b91355e9   KAMEZAWA Hiroyuki   memcg: fix/change...
4568
4569
4570
4571
4572
4573
4574
  #else
  static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	return NULL;
  }
  #endif
90254a658   Daisuke Nishimura   memcg: clean up m...
4575

87946a722   Daisuke Nishimura   memcg: move charg...
4576
4577
4578
4579
  static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
  			unsigned long addr, pte_t ptent, swp_entry_t *entry)
  {
  	struct page *page = NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4580
4581
4582
4583
4584
  	struct address_space *mapping;
  	pgoff_t pgoff;
  
  	if (!vma->vm_file) /* anonymous vma */
  		return NULL;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4585
  	if (!(mc.flags & MOVE_FILE))
87946a722   Daisuke Nishimura   memcg: move charg...
4586
  		return NULL;
87946a722   Daisuke Nishimura   memcg: move charg...
4587
  	mapping = vma->vm_file->f_mapping;
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4588
  	pgoff = linear_page_index(vma, addr);
87946a722   Daisuke Nishimura   memcg: move charg...
4589
4590
  
  	/* page is moved even if it's not RSS of this task (page-faulted). */
aa3b18955   Hugh Dickins   tmpfs: convert me...
4591
4592
  #ifdef CONFIG_SWAP
  	/* shmem/tmpfs may report page out on swap: account for that too. */
139b6a6fb   Johannes Weiner   mm: filemap: upda...
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
  	if (shmem_mapping(mapping)) {
  		page = find_get_entry(mapping, pgoff);
  		if (radix_tree_exceptional_entry(page)) {
  			swp_entry_t swp = radix_to_swp_entry(page);
  			if (do_swap_account)
  				*entry = swp;
  			page = find_get_page(swap_address_space(swp), swp.val);
  		}
  	} else
  		page = find_get_page(mapping, pgoff);
  #else
  	page = find_get_page(mapping, pgoff);
aa3b18955   Hugh Dickins   tmpfs: convert me...
4605
  #endif
87946a722   Daisuke Nishimura   memcg: move charg...
4606
4607
  	return page;
  }
b1b0deabb   Chen Gang   mm: memcontrol: l...
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
  /**
   * mem_cgroup_move_account - move account of the page
   * @page: the page
   * @nr_pages: number of regular pages (>1 for huge pages)
   * @from: mem_cgroup which the page is moved from.
   * @to:	mem_cgroup which the page is moved to. @from != @to.
   *
   * The caller must confirm the following:
   * - the page is not on the LRU (isolate_lru_page() is useful.)
   * - compound_lock is held when nr_pages > 1
   *
   * This function doesn't "charge" the new cgroup and doesn't "uncharge"
   * the old cgroup.
   */
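  /*
   * Illustrative caller sketch, modelled on the THP path in
   * mem_cgroup_move_charge_pte_range() below, to make the isolation
   * requirement concrete:
   *
   *	if (!isolate_lru_page(page)) {
   *		if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
   *					     mc.from, mc.to)) {
   *			mc.precharge -= HPAGE_PMD_NR;
   *			mc.moved_charge += HPAGE_PMD_NR;
   *		}
   *		putback_lru_page(page);
   *	}
   *	put_page(page);
   */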
  static int mem_cgroup_move_account(struct page *page,
  				   unsigned int nr_pages,
  				   struct mem_cgroup *from,
  				   struct mem_cgroup *to)
  {
  	unsigned long flags;
  	int ret;
c4843a759   Greg Thelen   memcg: add per cg...
4629
  	bool anon;
b1b0deabb   Chen Gang   mm: memcontrol: l...
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
  
  	VM_BUG_ON(from == to);
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	/*
  	 * The page is isolated from LRU. So, collapse function
  	 * will not handle this page. But page splitting can happen.
  	 * Do this check under compound_page_lock(). The caller should
  	 * hold it.
  	 */
  	ret = -EBUSY;
  	if (nr_pages > 1 && !PageTransHuge(page))
  		goto out;
  
  	/*
  	 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
  	 * of its source page while we change it: page migration takes
  	 * both pages off the LRU, but page cache replacement doesn't.
  	 */
  	if (!trylock_page(page))
  		goto out;
  
  	ret = -EINVAL;
  	if (page->mem_cgroup != from)
  		goto out_unlock;
c4843a759   Greg Thelen   memcg: add per cg...
4654
  	anon = PageAnon(page);
b1b0deabb   Chen Gang   mm: memcontrol: l...
4655
  	spin_lock_irqsave(&from->move_lock, flags);
c4843a759   Greg Thelen   memcg: add per cg...
4656
  	if (!anon && page_mapped(page)) {
b1b0deabb   Chen Gang   mm: memcontrol: l...
4657
4658
4659
4660
4661
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
  			       nr_pages);
  	}
c4843a759   Greg Thelen   memcg: add per cg...
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
  	/*
  	 * move_lock grabbed above and caller set from->moving_account, so
  	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
  	 * So mapping should be stable for dirty pages.
  	 */
  	if (!anon && PageDirty(page)) {
  		struct address_space *mapping = page_mapping(page);
  
  		if (mapping_cap_account_dirty(mapping)) {
  			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
  				       nr_pages);
  			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
  				       nr_pages);
  		}
  	}
b1b0deabb   Chen Gang   mm: memcontrol: l...
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
  	if (PageWriteback(page)) {
  		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
  			       nr_pages);
  	}
  
  	/*
  	 * It is safe to change page->mem_cgroup here because the page
  	 * is referenced, charged, and isolated - we can't race with
  	 * uncharging, charging, migration, or LRU putback.
  	 */
  
  	/* caller should have done css_get */
  	page->mem_cgroup = to;
  	spin_unlock_irqrestore(&from->move_lock, flags);
  
  	ret = 0;
  
  	local_irq_disable();
  	mem_cgroup_charge_statistics(to, page, nr_pages);
  	memcg_check_events(to, page);
  	mem_cgroup_charge_statistics(from, page, -nr_pages);
  	memcg_check_events(from, page);
  	local_irq_enable();
  out_unlock:
  	unlock_page(page);
  out:
  	return ret;
  }
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4707
  static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
90254a658   Daisuke Nishimura   memcg: clean up m...
4708
4709
4710
  		unsigned long addr, pte_t ptent, union mc_target *target)
  {
  	struct page *page = NULL;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4711
  	enum mc_target_type ret = MC_TARGET_NONE;
90254a658   Daisuke Nishimura   memcg: clean up m...
4712
4713
4714
4715
4716
4717
  	swp_entry_t ent = { .val = 0 };
  
  	if (pte_present(ptent))
  		page = mc_handle_present_pte(vma, addr, ptent);
  	else if (is_swap_pte(ptent))
  		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
0661a3361   Kirill A. Shutemov   mm: remove rest u...
4718
  	else if (pte_none(ptent))
87946a722   Daisuke Nishimura   memcg: move charg...
4719
  		page = mc_handle_file_pte(vma, addr, ptent, &ent);
90254a658   Daisuke Nishimura   memcg: clean up m...
4720
4721
  
  	if (!page && !ent.val)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4722
  		return ret;
024914477   Daisuke Nishimura   memcg: move charg...
4723
  	if (page) {
024914477   Daisuke Nishimura   memcg: move charg...
4724
  		/*
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4725
  		 * Do only loose check w/o serialization.
1306a85ae   Johannes Weiner   mm: embed the mem...
4726
  		 * mem_cgroup_move_account() checks the page is valid or
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
4727
  		 * not under LRU exclusion.
024914477   Daisuke Nishimura   memcg: move charg...
4728
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
4729
  		if (page->mem_cgroup == mc.from) {
024914477   Daisuke Nishimura   memcg: move charg...
4730
4731
4732
4733
4734
4735
4736
  			ret = MC_TARGET_PAGE;
  			if (target)
  				target->page = page;
  		}
  		if (!ret || !target)
  			put_page(page);
  	}
90254a658   Daisuke Nishimura   memcg: clean up m...
4737
4738
  	/* There is a swap entry and a page doesn't exist or isn't charged */
  	if (ent.val && !ret &&
34c00c319   Li Zefan   memcg: convert to...
4739
  	    mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
7f0f15464   KAMEZAWA Hiroyuki   memcg: fix css_id...
4740
4741
4742
  		ret = MC_TARGET_SWAP;
  		if (target)
  			target->ent = ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4743
  	}
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4744
4745
  	return ret;
  }
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  /*
   * We don't consider swapping or file mapped pages because THP does not
   * support them for now.
   * Caller should make sure that pmd_trans_huge(pmd) is true.
   */
  static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	struct page *page = NULL;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4756
4757
4758
  	enum mc_target_type ret = MC_TARGET_NONE;
  
  	page = pmd_page(pmd);
309381fea   Sasha Levin   mm: dump page whe...
4759
  	VM_BUG_ON_PAGE(!page || !PageHead(page), page);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4760
  	if (!(mc.flags & MOVE_ANON))
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4761
  		return ret;
1306a85ae   Johannes Weiner   mm: embed the mem...
4762
  	if (page->mem_cgroup == mc.from) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
  		ret = MC_TARGET_PAGE;
  		if (target) {
  			get_page(page);
  			target->page = page;
  		}
  	}
  	return ret;
  }
  #else
  static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
  		unsigned long addr, pmd_t pmd, union mc_target *target)
  {
  	return MC_TARGET_NONE;
  }
  #endif
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4778
4779
4780
4781
  static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
  					unsigned long addr, unsigned long end,
  					struct mm_walk *walk)
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4782
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4783
4784
  	pte_t *pte;
  	spinlock_t *ptl;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4785
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4786
4787
  		if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
  			mc.precharge += HPAGE_PMD_NR;
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4788
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4789
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4790
  	}
033193275   Dave Hansen   pagewalk: only sp...
4791

45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4792
4793
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4794
4795
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; pte++, addr += PAGE_SIZE)
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4796
  		if (get_mctgt_type(vma, addr, *pte, NULL))
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4797
4798
4799
  			mc.precharge++;	/* increment precharge temporarily */
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4800
4801
  	return 0;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4802
4803
4804
  static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
  {
  	unsigned long precharge;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4805

26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4806
4807
4808
4809
  	struct mm_walk mem_cgroup_count_precharge_walk = {
  		.pmd_entry = mem_cgroup_count_precharge_pte_range,
  		.mm = mm,
  	};
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4810
  	down_read(&mm->mmap_sem);
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4811
  	walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4812
  	up_read(&mm->mmap_sem);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4813
4814
4815
4816
4817
4818
  
  	precharge = mc.precharge;
  	mc.precharge = 0;
  
  	return precharge;
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4819
4820
  static int mem_cgroup_precharge_mc(struct mm_struct *mm)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4821
4822
4823
4824
4825
  	unsigned long precharge = mem_cgroup_count_precharge(mm);
  
  	VM_BUG_ON(mc.moving_task);
  	mc.moving_task = current;
  	return mem_cgroup_do_precharge(precharge);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4826
  }
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4827
4828
  /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
  static void __mem_cgroup_clear_mc(void)
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4829
  {
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4830
4831
  	struct mem_cgroup *from = mc.from;
  	struct mem_cgroup *to = mc.to;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4832
  	/* we must uncharge all the leftover precharges from mc.to */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4833
  	if (mc.precharge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4834
  		cancel_charge(mc.to, mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4835
4836
4837
4838
4839
4840
4841
  		mc.precharge = 0;
  	}
  	/*
  	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
  	 * we must uncharge here.
  	 */
  	if (mc.moved_charge) {
00501b531   Johannes Weiner   mm: memcontrol: r...
4842
  		cancel_charge(mc.from, mc.moved_charge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4843
  		mc.moved_charge = 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4844
  	}
483c30b51   Daisuke Nishimura   memcg: improve pe...
4845
4846
  	/* we must fixup refcnts and charges */
  	if (mc.moved_swap) {
483c30b51   Daisuke Nishimura   memcg: improve pe...
4847
  		/* uncharge swap account from the old cgroup */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4848
  		if (!mem_cgroup_is_root(mc.from))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4849
  			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4850

05b843012   Johannes Weiner   mm: memcontrol: u...
4851
  		/*
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4852
4853
  		 * we charged both to->memory and to->memsw, so we
  		 * should uncharge to->memory.
05b843012   Johannes Weiner   mm: memcontrol: u...
4854
  		 */
ce00a9673   Johannes Weiner   mm: memcontrol: r...
4855
  		if (!mem_cgroup_is_root(mc.to))
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4856
  			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
4857
  		css_put_many(&mc.from->css, mc.moved_swap);
3e32cb2e0   Johannes Weiner   mm: memcontrol: l...
4858

4050377b5   Li Zefan   memcg: use css_ge...
4859
  		/* we've already done css_get(mc.to) */
483c30b51   Daisuke Nishimura   memcg: improve pe...
4860
4861
  		mc.moved_swap = 0;
  	}
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4862
4863
4864
4865
4866
4867
4868
  	memcg_oom_recover(from);
  	memcg_oom_recover(to);
  	wake_up_all(&mc.waitq);
  }
  
  static void mem_cgroup_clear_mc(void)
  {
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4869
4870
4871
4872
4873
4874
  	/*
  	 * we must clear moving_task before waking up waiters at the end of
  	 * task migration.
  	 */
  	mc.moving_task = NULL;
  	__mem_cgroup_clear_mc();
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4875
  	spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4876
4877
  	mc.from = NULL;
  	mc.to = NULL;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4878
  	spin_unlock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4879
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4880
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4881
  				 struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4882
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
4883
  	struct task_struct *p = cgroup_taskset_first(tset);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4884
  	int ret = 0;
eb95419b0   Tejun Heo   cgroup: pass arou...
4885
  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4886
  	unsigned long move_flags;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4887

ee5e8472b   Glauber Costa   memcg: prevent ch...
4888
4889
4890
4891
4892
  	/*
  	 * We are now committed to this value, whatever it is.  Changes in this
  	 * tunable will only affect upcoming migrations, not the current one.
  	 * So we need to save it, and keep it going.
  	 */
4db0c3c29   Jason Low   mm: remove rest o...
4893
  	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4894
  	if (move_flags) {
7dc74be03   Daisuke Nishimura   memcg: add interf...
4895
4896
  		struct mm_struct *mm;
  		struct mem_cgroup *from = mem_cgroup_from_task(p);
c0ff4b854   Raghavendra K T   memcg: rename mem...
4897
  		VM_BUG_ON(from == memcg);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4898
4899
4900
4901
  
  		mm = get_task_mm(p);
  		if (!mm)
  			return 0;
7dc74be03   Daisuke Nishimura   memcg: add interf...
4902
  		/* We move charges only when we move an owner of the mm */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4903
4904
4905
4906
  		if (mm->owner == p) {
  			VM_BUG_ON(mc.from);
  			VM_BUG_ON(mc.to);
  			VM_BUG_ON(mc.precharge);
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4907
  			VM_BUG_ON(mc.moved_charge);
483c30b51   Daisuke Nishimura   memcg: improve pe...
4908
  			VM_BUG_ON(mc.moved_swap);
247b1447b   Johannes Weiner   mm: memcontrol: f...
4909

2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4910
  			spin_lock(&mc.lock);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4911
  			mc.from = from;
c0ff4b854   Raghavendra K T   memcg: rename mem...
4912
  			mc.to = memcg;
1dfab5abc   Johannes Weiner   mm: memcontrol: f...
4913
  			mc.flags = move_flags;
2bd9bb206   KAMEZAWA Hiroyuki   memcg: clean up w...
4914
  			spin_unlock(&mc.lock);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4915
  			/* We set mc.moving_task later */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4916
4917
4918
4919
  
  			ret = mem_cgroup_precharge_mc(mm);
  			if (ret)
  				mem_cgroup_clear_mc();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
4920
4921
  		}
  		mmput(mm);
7dc74be03   Daisuke Nishimura   memcg: add interf...
4922
4923
4924
  	}
  	return ret;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
4925
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
4926
  				     struct cgroup_taskset *tset)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4927
  {
4e2f245d3   Johannes Weiner   mm: memcontrol: d...
4928
4929
  	if (mc.to)
  		mem_cgroup_clear_mc();
7dc74be03   Daisuke Nishimura   memcg: add interf...
4930
  }
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4931
4932
4933
  static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
  				unsigned long addr, unsigned long end,
  				struct mm_walk *walk)
7dc74be03   Daisuke Nishimura   memcg: add interf...
4934
  {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4935
  	int ret = 0;
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
4936
  	struct vm_area_struct *vma = walk->vma;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4937
4938
  	pte_t *pte;
  	spinlock_t *ptl;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4939
4940
4941
  	enum mc_target_type target_type;
  	union mc_target target;
  	struct page *page;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4942

12724850e   Naoya Horiguchi   memcg: avoid THP ...
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
  	/*
  	 * We don't take compound_lock() here, but no race with thp splitting
  	 * can happen because:
  	 *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
  	 *    under splitting, which means there's no concurrent thp split,
  	 *  - if another thread runs into split_huge_page() just after we
  	 *    entered this if-block, the thread must wait for page table lock
  	 *    to be unlocked in __split_huge_page_splitting(), where the main
  	 *    part of thp split is not executed yet.
  	 */
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4953
  	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
62ade86ab   Hugh Dickins   memcg,thp: fix re...
4954
  		if (mc.precharge < HPAGE_PMD_NR) {
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4955
  			spin_unlock(ptl);
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4956
4957
4958
4959
4960
4961
  			return 0;
  		}
  		target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
  		if (target_type == MC_TARGET_PAGE) {
  			page = target.page;
  			if (!isolate_lru_page(page)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4962
  				if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
1306a85ae   Johannes Weiner   mm: embed the mem...
4963
  							     mc.from, mc.to)) {
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4964
4965
4966
4967
4968
4969
4970
  					mc.precharge -= HPAGE_PMD_NR;
  					mc.moved_charge += HPAGE_PMD_NR;
  				}
  				putback_lru_page(page);
  			}
  			put_page(page);
  		}
bf929152e   Kirill A. Shutemov   mm, thp: change p...
4971
  		spin_unlock(ptl);
1a5a9906d   Andrea Arcangeli   mm: thp: fix pmd_...
4972
  		return 0;
12724850e   Naoya Horiguchi   memcg: avoid THP ...
4973
  	}
45f83cefe   Andrea Arcangeli   mm: thp: fix up p...
4974
4975
  	if (pmd_trans_unstable(pmd))
  		return 0;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4976
4977
4978
4979
  retry:
  	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  	for (; addr != end; addr += PAGE_SIZE) {
  		pte_t ptent = *(pte++);
024914477   Daisuke Nishimura   memcg: move charg...
4980
  		swp_entry_t ent;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4981
4982
4983
  
  		if (!mc.precharge)
  			break;
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4984
  		switch (get_mctgt_type(vma, addr, ptent, &target)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4985
4986
4987
4988
  		case MC_TARGET_PAGE:
  			page = target.page;
  			if (isolate_lru_page(page))
  				goto put;
1306a85ae   Johannes Weiner   mm: embed the mem...
4989
  			if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4990
  				mc.precharge--;
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
4991
4992
  				/* we uncharge from mc.from later. */
  				mc.moved_charge++;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4993
4994
  			}
  			putback_lru_page(page);
8d32ff844   Naoya Horiguchi   memcg: clean up e...
4995
  put:			/* get_mctgt_type() gets the page */
4ffef5fef   Daisuke Nishimura   memcg: move charg...
4996
4997
  			put_page(page);
  			break;
024914477   Daisuke Nishimura   memcg: move charg...
4998
4999
  		case MC_TARGET_SWAP:
  			ent = target.ent;
e91cbb425   Hugh Dickins   memcg swap: mem_c...
5000
  			if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
024914477   Daisuke Nishimura   memcg: move charg...
5001
  				mc.precharge--;
483c30b51   Daisuke Nishimura   memcg: improve pe...
5002
5003
5004
  				/* we fixup refcnts and charges later. */
  				mc.moved_swap++;
  			}
024914477   Daisuke Nishimura   memcg: move charg...
5005
  			break;
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
  		default:
  			break;
  		}
  	}
  	pte_unmap_unlock(pte - 1, ptl);
  	cond_resched();
  
  	if (addr != end) {
  		/*
  		 * We have consumed all the precharges we got in can_attach().
  		 * We try to charge one page at a time, but stop doing any
  		 * additional charges to mc.to once a charge has failed in the
  		 * attach() phase.
  		 */
854ffa8d1   Daisuke Nishimura   memcg: improve pe...
5020
  		ret = mem_cgroup_do_precharge(1);
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5021
5022
5023
5024
5025
5026
5027
5028
5029
  		if (!ret)
  			goto retry;
  	}
  
  	return ret;
  }
  
  static void mem_cgroup_move_charge(struct mm_struct *mm)
  {
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
5030
5031
5032
5033
  	struct mm_walk mem_cgroup_move_charge_walk = {
  		.pmd_entry = mem_cgroup_move_charge_pte_range,
  		.mm = mm,
  	};
4ffef5fef   Daisuke Nishimura   memcg: move charg...
5034
5035
  
  	lru_add_drain_all();
312722cbb   Johannes Weiner   mm: memcontrol: s...
5036
5037
5038
5039
5040
5041
5042
  	/*
  	 * Signal mem_cgroup_begin_page_stat() to take the memcg's
  	 * move_lock while we're moving its pages to another memcg.
  	 * Then wait for already started RCU-only updates to finish.
  	 */
  	atomic_inc(&mc.from->moving_account);
  	synchronize_rcu();
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
  retry:
  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
  		/*
  		 * Someone holding the mmap_sem might be waiting on the waitq.
  		 * So we cancel all extra charges, wake up all waiters, and
  		 * retry. Because we cancel the precharges, we might not be able
  		 * to move enough charges, but charge moving is a best-effort
  		 * feature anyway, so it isn't a big problem.
  		 */
  		__mem_cgroup_clear_mc();
  		cond_resched();
  		goto retry;
  	}
26bcd64aa   Naoya Horiguchi   memcg: cleanup pr...
5056
5057
5058
5059
5060
  	/*
  	 * When we have consumed all the precharges and fail to charge
  	 * any further, the page walk simply aborts.
  	 */
  	walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5061
  	up_read(&mm->mmap_sem);
312722cbb   Johannes Weiner   mm: memcontrol: s...
5062
  	atomic_dec(&mc.from->moving_account);
7dc74be03   Daisuke Nishimura   memcg: add interf...
5063
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5064
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5065
  				 struct cgroup_taskset *tset)
67e465a77   Balbir Singh   Memory controller...
5066
  {
2f7ee5691   Tejun Heo   cgroup: introduce...
5067
  	struct task_struct *p = cgroup_taskset_first(tset);
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5068
  	struct mm_struct *mm = get_task_mm(p);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5069

dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5070
  	if (mm) {
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5071
5072
  		if (mc.to)
  			mem_cgroup_move_charge(mm);
dfe076b09   Daisuke Nishimura   memcg: fix deadlo...
5073
5074
  		mmput(mm);
  	}
a433658c3   KOSAKI Motohiro   vmscan,memcg: mem...
5075
5076
  	if (mc.to)
  		mem_cgroup_clear_mc();
67e465a77   Balbir Singh   Memory controller...
5077
  }
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5078
  #else	/* !CONFIG_MMU */
eb95419b0   Tejun Heo   cgroup: pass arou...
5079
  static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5080
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5081
5082
5083
  {
  	return 0;
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5084
  static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5085
  				     struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5086
5087
  {
  }
eb95419b0   Tejun Heo   cgroup: pass arou...
5088
  static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
761b3ef50   Li Zefan   cgroup: remove cg...
5089
  				 struct cgroup_taskset *tset)
5cfb80a73   Daisuke Nishimura   memcg: disable mo...
5090
5091
5092
  {
  }
  #endif
67e465a77   Balbir Singh   Memory controller...
5093

f00baae7a   Tejun Heo   memcg: force use_...
5094
5095
  /*
   * Cgroup retains root cgroups across [un]mount cycles, making it necessary
aa6ec29be   Tejun Heo   cgroup: remove sa...
5096
5097
   * to verify whether we're attached to the default hierarchy on each mount
   * attempt.
f00baae7a   Tejun Heo   memcg: force use_...
5098
   */
eb95419b0   Tejun Heo   cgroup: pass arou...
5099
  static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
f00baae7a   Tejun Heo   memcg: force use_...
5100
5101
  {
  	/*
aa6ec29be   Tejun Heo   cgroup: remove sa...
5102
  	 * use_hierarchy is forced on the default hierarchy.  cgroup core
f00baae7a   Tejun Heo   memcg: force use_...
5103
5104
5105
  	 * guarantees that @root doesn't have any children, so turning it
  	 * on for the root memcg is enough.
  	 */
aa6ec29be   Tejun Heo   cgroup: remove sa...
5106
  	if (cgroup_on_dfl(root_css->cgroup))
7feee590b   Vladimir Davydov   memcg: disable hi...
5107
5108
5109
  		root_mem_cgroup->use_hierarchy = true;
  	else
  		root_mem_cgroup->use_hierarchy = false;
f00baae7a   Tejun Heo   memcg: force use_...
5110
  }
241994ed8   Johannes Weiner   mm: memcontrol: d...
5111
5112
5113
5114
5115
5116
5117
5118
5119
  static u64 memory_current_read(struct cgroup_subsys_state *css,
  			       struct cftype *cft)
  {
  	return mem_cgroup_usage(mem_cgroup_from_css(css), false);
  }
  
  static int memory_low_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5120
  	unsigned long low = READ_ONCE(memcg->low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5121
5122
  
  	if (low == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5123
5124
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
  	else
  		seq_printf(m, "%llu
  ", (u64)low * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_low_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long low;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5140
  	err = page_counter_memparse(buf, "max", &low);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
  	if (err)
  		return err;
  
  	memcg->low = low;
  
  	return nbytes;
  }
  
  static int memory_high_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5152
  	unsigned long high = READ_ONCE(memcg->high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5153
5154
  
  	if (high == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5155
5156
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
  	else
  		seq_printf(m, "%llu
  ", (u64)high * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_high_write(struct kernfs_open_file *of,
  				 char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long high;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5172
  	err = page_counter_memparse(buf, "max", &high);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5173
5174
5175
5176
  	if (err)
  		return err;
  
  	memcg->high = high;
2529bb3aa   Tejun Heo   writeback: reset ...
5177
  	memcg_wb_domain_size_changed(memcg);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5178
5179
5180
5181
5182
5183
  	return nbytes;
  }
  
  static int memory_max_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4db0c3c29   Jason Low   mm: remove rest o...
5184
  	unsigned long max = READ_ONCE(memcg->memory.limit);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5185
5186
  
  	if (max == PAGE_COUNTER_MAX)
d2973697b   Johannes Weiner   mm: memcontrol: u...
5187
5188
  		seq_puts(m, "max
  ");
241994ed8   Johannes Weiner   mm: memcontrol: d...
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
  	else
  		seq_printf(m, "%llu
  ", (u64)max * PAGE_SIZE);
  
  	return 0;
  }
  
  static ssize_t memory_max_write(struct kernfs_open_file *of,
  				char *buf, size_t nbytes, loff_t off)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
  	unsigned long max;
  	int err;
  
  	buf = strstrip(buf);
d2973697b   Johannes Weiner   mm: memcontrol: u...
5204
  	err = page_counter_memparse(buf, "max", &max);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5205
5206
5207
5208
5209
5210
  	if (err)
  		return err;
  
  	err = mem_cgroup_resize_limit(memcg, max);
  	if (err)
  		return err;
2529bb3aa   Tejun Heo   writeback: reset ...
5211
  	memcg_wb_domain_size_changed(memcg);
241994ed8   Johannes Weiner   mm: memcontrol: d...
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
  	return nbytes;
  }
  
  static int memory_events_show(struct seq_file *m, void *v)
  {
  	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
  
  	seq_printf(m, "low %lu
  ", mem_cgroup_read_events(memcg, MEMCG_LOW));
  	seq_printf(m, "high %lu
  ", mem_cgroup_read_events(memcg, MEMCG_HIGH));
  	seq_printf(m, "max %lu
  ", mem_cgroup_read_events(memcg, MEMCG_MAX));
  	seq_printf(m, "oom %lu
  ", mem_cgroup_read_events(memcg, MEMCG_OOM));
  
  	return 0;
  }
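  
  /*
   * Unified-hierarchy (cgroup v2) interface: the cftype entries below show
   * up as memory.current, memory.low, memory.high, memory.max and
   * memory.events in each cgroup directory.  They are registered through
   * .dfl_cftypes in memory_cgrp_subsys near the bottom of this file.
   */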
  
  static struct cftype memory_files[] = {
  	{
  		.name = "current",
  		.read_u64 = memory_current_read,
  	},
  	{
  		.name = "low",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_low_show,
  		.write = memory_low_write,
  	},
  	{
  		.name = "high",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_high_show,
  		.write = memory_high_write,
  	},
  	{
  		.name = "max",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_max_show,
  		.write = memory_max_write,
  	},
  	{
  		.name = "events",
  		.flags = CFTYPE_NOT_ON_ROOT,
  		.seq_show = memory_events_show,
  	},
  	{ }	/* terminate */
  };
073219e99   Tejun Heo   cgroup: clean up ...
5261
  struct cgroup_subsys memory_cgrp_subsys = {
92fb97487   Tejun Heo   cgroup: rename ->...
5262
  	.css_alloc = mem_cgroup_css_alloc,
d142e3e66   Glauber Costa   memcg: split part...
5263
  	.css_online = mem_cgroup_css_online,
92fb97487   Tejun Heo   cgroup: rename ->...
5264
5265
  	.css_offline = mem_cgroup_css_offline,
  	.css_free = mem_cgroup_css_free,
1ced953b1   Tejun Heo   blkcg, memcg: mak...
5266
  	.css_reset = mem_cgroup_css_reset,
7dc74be03   Daisuke Nishimura   memcg: add interf...
5267
5268
  	.can_attach = mem_cgroup_can_attach,
  	.cancel_attach = mem_cgroup_cancel_attach,
67e465a77   Balbir Singh   Memory controller...
5269
  	.attach = mem_cgroup_move_task,
f00baae7a   Tejun Heo   memcg: force use_...
5270
  	.bind = mem_cgroup_bind,
241994ed8   Johannes Weiner   mm: memcontrol: d...
5271
5272
  	.dfl_cftypes = memory_files,
  	.legacy_cftypes = mem_cgroup_legacy_files,
6d12e2d8d   KAMEZAWA Hiroyuki   per-zone and recl...
5273
  	.early_init = 0,
8cdea7c05   Balbir Singh   Memory controller...
5274
  };
c077719be   KAMEZAWA Hiroyuki   memcg: mem+swap c...
5275

241994ed8   Johannes Weiner   mm: memcontrol: d...
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
  /**
   * mem_cgroup_events - count memory events against a cgroup
   * @memcg: the memory cgroup
   * @idx: the event index
   * @nr: the number of events to account for
   */
  void mem_cgroup_events(struct mem_cgroup *memcg,
  		       enum mem_cgroup_events_index idx,
  		       unsigned int nr)
  {
  	this_cpu_add(memcg->stat->events[idx], nr);
  }
  
  /**
   * mem_cgroup_low - check if memory consumption is below the normal range
   * @root: the highest ancestor to consider
   * @memcg: the memory cgroup to check
   *
   * Returns %true if memory consumption of @memcg, and that of all
   * configurable ancestors up to @root, is below the normal range.
   */
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  {
  	if (mem_cgroup_disabled())
  		return false;
  
  	/*
  	 * The toplevel group doesn't have a configurable range, so
  	 * it's never low when looked at directly, and it is not
  	 * considered an ancestor when assessing the hierarchy.
  	 */
  
  	if (memcg == root_mem_cgroup)
  		return false;
4e54dede3   Michal Hocko   memcg: fix low li...
5310
  	if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5311
5312
5313
5314
5315
5316
5317
  		return false;
  
  	while (memcg != root) {
  		memcg = parent_mem_cgroup(memcg);
  
  		if (memcg == root_mem_cgroup)
  			break;
4e54dede3   Michal Hocko   memcg: fix low li...
5318
  		if (page_counter_read(&memcg->memory) >= memcg->low)
241994ed8   Johannes Weiner   mm: memcontrol: d...
5319
5320
5321
5322
  			return false;
  	}
  	return true;
  }
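  
  /*
   * Illustrative sketch (not part of this file): how a reclaim loop might
   * combine mem_cgroup_low() with mem_cgroup_events().  The hierarchy walk
   * uses the real mem_cgroup_iter() API; the "may_thrash" parameter is an
   * assumption standing in for the caller's own fallback policy.
   */
  static void example_shrink_hierarchy(struct mem_cgroup *root, bool may_thrash)
  {
  	struct mem_cgroup *memcg;
  
  	memcg = mem_cgroup_iter(root, NULL, NULL);
  	do {
  		if (mem_cgroup_low(root, memcg)) {
  			/* within the protected low range: skip unless forced */
  			if (!may_thrash)
  				continue;
  			mem_cgroup_events(memcg, MEMCG_LOW, 1);
  		}
  		/* ... scan and shrink @memcg's LRU lists here ... */
  	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
  }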
00501b531   Johannes Weiner   mm: memcontrol: r...
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
  /**
   * mem_cgroup_try_charge - try charging a page
   * @page: page to charge
   * @mm: mm context of the victim
   * @gfp_mask: reclaim mode
   * @memcgp: charged memcg return
   *
   * Try to charge @page to the memcg that @mm belongs to, reclaiming
   * pages according to @gfp_mask if necessary.
   *
   * Returns 0 on success, with *@memcgp pointing to the charged memcg.
   * Otherwise, an error code is returned.
   *
   * After page->mapping has been set up, the caller must finalize the
   * charge with mem_cgroup_commit_charge(), or abort the transaction
   * with mem_cgroup_cancel_charge() if page instantiation fails.
   */
  int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
  			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
  {
  	struct mem_cgroup *memcg = NULL;
  	unsigned int nr_pages = 1;
  	int ret = 0;
  
  	if (mem_cgroup_disabled())
  		goto out;
  
  	if (PageSwapCache(page)) {
00501b531   Johannes Weiner   mm: memcontrol: r...
5351
5352
5353
5354
5355
5356
5357
  		/*
  		 * Every swap fault against a single page tries to charge the
  		 * page, so bail out as early as possible.  shmem_unuse() encounters
  		 * already charged pages, too.  The USED bit is protected by
  		 * the page lock, which serializes swap cache removal, which
  		 * in turn serializes uncharging.
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5358
  		if (page->mem_cgroup)
00501b531   Johannes Weiner   mm: memcontrol: r...
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
  			goto out;
  	}
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	if (do_swap_account && PageSwapCache(page))
  		memcg = try_get_mem_cgroup_from_page(page);
  	if (!memcg)
  		memcg = get_mem_cgroup_from_mm(mm);
  
  	ret = try_charge(memcg, gfp_mask, nr_pages);
  
  	css_put(&memcg->css);
  
  	if (ret == -EINTR) {
  		memcg = root_mem_cgroup;
  		ret = 0;
  	}
  out:
  	*memcgp = memcg;
  	return ret;
  }
  
  /**
   * mem_cgroup_commit_charge - commit a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   * @lrucare: page might be on LRU already
   *
   * Finalize a charge transaction started by mem_cgroup_try_charge(),
   * after page->mapping has been set up.  This must happen atomically
   * as part of the page instantiation, i.e. under the page table lock
   * for anonymous pages, under the page lock for page and swap cache.
   *
   * In addition, the page must not be on the LRU during the commit, to
   * prevent racing with task migration.  If it might be, use @lrucare.
   *
   * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
   */
  void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  			      bool lrucare)
  {
  	unsigned int nr_pages = 1;
  
  	VM_BUG_ON_PAGE(!page->mapping, page);
  	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5418
  	commit_charge(page, memcg, lrucare);
00501b531   Johannes Weiner   mm: memcontrol: r...
5419
5420
5421
5422
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5423
5424
5425
5426
  	local_irq_disable();
  	mem_cgroup_charge_statistics(memcg, page, nr_pages);
  	memcg_check_events(memcg, page);
  	local_irq_enable();
00501b531   Johannes Weiner   mm: memcontrol: r...
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
  
  	if (do_swap_account && PageSwapCache(page)) {
  		swp_entry_t entry = { .val = page_private(page) };
  		/*
  		 * The swap entry might not get freed for a long time, so
  		 * let's not wait for it.  The page already received a
  		 * memory+swap charge, drop the swap entry duplicate.
  		 */
  		mem_cgroup_uncharge_swap(entry);
  	}
  }
  
  /**
   * mem_cgroup_cancel_charge - cancel a page charge
   * @page: page to charge
   * @memcg: memcg to charge the page to
   *
   * Cancel a charge transaction started by mem_cgroup_try_charge().
   */
  void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
  {
  	unsigned int nr_pages = 1;
  
  	if (mem_cgroup_disabled())
  		return;
  	/*
  	 * Swap faults will attempt to charge the same page multiple
  	 * times.  But reuse_swap_page() might have removed the page
  	 * from swapcache already, so we can't check PageSwapCache().
  	 */
  	if (!memcg)
  		return;
  
  	if (PageTransHuge(page)) {
  		nr_pages <<= compound_order(page);
  		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  	}
  
  	cancel_charge(memcg, nr_pages);
  }
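  
  /*
   * Illustrative sketch (not part of this file) of the charge transaction
   * described above: try_charge, instantiate the page, then either commit
   * or cancel.  The "instantiated" parameter is a stand-in for the outcome
   * of the caller's own page-table / page-cache setup, which is elided.
   */
  static int example_charge_new_page(struct page *page, struct mm_struct *mm,
  				   gfp_t gfp_mask, bool instantiated)
  {
  	struct mem_cgroup *memcg;
  	int ret;
  
  	ret = mem_cgroup_try_charge(page, mm, gfp_mask, &memcg);
  	if (ret)
  		return ret;
  
  	/* ... set up page->mapping and map the page here ... */
  
  	if (!instantiated) {
  		/* roll the charge back if instantiation failed */
  		mem_cgroup_cancel_charge(page, memcg);
  		return -ENOMEM;
  	}
  
  	/* make the charge permanent now that page->mapping is set up */
  	mem_cgroup_commit_charge(page, memcg, false);
  	return 0;
  }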
747db954c   Johannes Weiner   mm: memcontrol: u...
5467
  static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
747db954c   Johannes Weiner   mm: memcontrol: u...
5468
5469
5470
  			   unsigned long nr_anon, unsigned long nr_file,
  			   unsigned long nr_huge, struct page *dummy_page)
  {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5471
  	unsigned long nr_pages = nr_anon + nr_file;
747db954c   Johannes Weiner   mm: memcontrol: u...
5472
  	unsigned long flags;
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5473
  	if (!mem_cgroup_is_root(memcg)) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5474
5475
5476
  		page_counter_uncharge(&memcg->memory, nr_pages);
  		if (do_swap_account)
  			page_counter_uncharge(&memcg->memsw, nr_pages);
ce00a9673   Johannes Weiner   mm: memcontrol: r...
5477
5478
  		memcg_oom_recover(memcg);
  	}
747db954c   Johannes Weiner   mm: memcontrol: u...
5479
5480
5481
5482
5483
5484
  
  	local_irq_save(flags);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
  	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
  	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5485
  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5486
5487
  	memcg_check_events(memcg, dummy_page);
  	local_irq_restore(flags);
e8ea14cc6   Johannes Weiner   mm: memcontrol: t...
5488
5489
  
  	if (!mem_cgroup_is_root(memcg))
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5490
  		css_put_many(&memcg->css, nr_pages);
747db954c   Johannes Weiner   mm: memcontrol: u...
5491
5492
5493
5494
5495
  }
  
  static void uncharge_list(struct list_head *page_list)
  {
  	struct mem_cgroup *memcg = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5496
5497
5498
5499
  	unsigned long nr_anon = 0;
  	unsigned long nr_file = 0;
  	unsigned long nr_huge = 0;
  	unsigned long pgpgout = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5500
5501
5502
5503
5504
5505
  	struct list_head *next;
  	struct page *page;
  
  	next = page_list->next;
  	do {
  		unsigned int nr_pages = 1;
747db954c   Johannes Weiner   mm: memcontrol: u...
5506
5507
5508
5509
5510
5511
  
  		page = list_entry(next, struct page, lru);
  		next = page->lru.next;
  
  		VM_BUG_ON_PAGE(PageLRU(page), page);
  		VM_BUG_ON_PAGE(page_count(page), page);
1306a85ae   Johannes Weiner   mm: embed the mem...
5512
  		if (!page->mem_cgroup)
747db954c   Johannes Weiner   mm: memcontrol: u...
5513
5514
5515
5516
  			continue;
  
  		/*
  		 * Nobody should be changing or seriously looking at
1306a85ae   Johannes Weiner   mm: embed the mem...
5517
  		 * page->mem_cgroup at this point; we have fully
298333157   Johannes Weiner   mm: memcontrol: r...
5518
  		 * exclusive access to the page.
747db954c   Johannes Weiner   mm: memcontrol: u...
5519
  		 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5520
  		if (memcg != page->mem_cgroup) {
747db954c   Johannes Weiner   mm: memcontrol: u...
5521
  			if (memcg) {
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5522
5523
5524
  				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  					       nr_huge, page);
  				pgpgout = nr_anon = nr_file = nr_huge = 0;
747db954c   Johannes Weiner   mm: memcontrol: u...
5525
  			}
1306a85ae   Johannes Weiner   mm: embed the mem...
5526
  			memcg = page->mem_cgroup;
747db954c   Johannes Weiner   mm: memcontrol: u...
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
  		}
  
  		if (PageTransHuge(page)) {
  			nr_pages <<= compound_order(page);
  			VM_BUG_ON_PAGE(!PageTransHuge(page), page);
  			nr_huge += nr_pages;
  		}
  
  		if (PageAnon(page))
  			nr_anon += nr_pages;
  		else
  			nr_file += nr_pages;
1306a85ae   Johannes Weiner   mm: embed the mem...
5539
  		page->mem_cgroup = NULL;
747db954c   Johannes Weiner   mm: memcontrol: u...
5540
5541
5542
5543
5544
  
  		pgpgout++;
  	} while (next != page_list);
  
  	if (memcg)
18eca2e63   Johannes Weiner   mm: memcontrol: r...
5545
5546
  		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
  			       nr_huge, page);
747db954c   Johannes Weiner   mm: memcontrol: u...
5547
  }
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5548
5549
5550
5551
5552
5553
5554
5555
5556
  /**
   * mem_cgroup_uncharge - uncharge a page
   * @page: page to uncharge
   *
   * Uncharge a page previously charged with mem_cgroup_try_charge() and
   * mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge(struct page *page)
  {
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5557
5558
  	if (mem_cgroup_disabled())
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5559
  	/* Don't touch page->lru of any random page, pre-check: */
1306a85ae   Johannes Weiner   mm: embed the mem...
5560
  	if (!page->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5561
  		return;
747db954c   Johannes Weiner   mm: memcontrol: u...
5562
5563
5564
  	INIT_LIST_HEAD(&page->lru);
  	uncharge_list(&page->lru);
  }
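  
  /*
   * Illustrative sketch (not part of this file): the uncharge side runs
   * from the page-freeing path once the last reference to the page has
   * been dropped; the final hand-off to the page allocator is elided.
   */
  static void example_release_page(struct page *page)
  {
  	/* must be unmapped and unreferenced, as uncharge_list() asserts */
  	VM_BUG_ON_PAGE(page_count(page), page);
  
  	mem_cgroup_uncharge(page);
  	/* ... return @page to the page allocator here ... */
  }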
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5565

747db954c   Johannes Weiner   mm: memcontrol: u...
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
  /**
   * mem_cgroup_uncharge_list - uncharge a list of pages
   * @page_list: list of pages to uncharge
   *
   * Uncharge a list of pages previously charged with
   * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
   */
  void mem_cgroup_uncharge_list(struct list_head *page_list)
  {
  	if (mem_cgroup_disabled())
  		return;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5577

747db954c   Johannes Weiner   mm: memcontrol: u...
5578
5579
  	if (!list_empty(page_list))
  		uncharge_list(page_list);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5580
5581
5582
5583
5584
5585
  }
  
  /**
   * mem_cgroup_migrate - migrate a charge to another page
   * @oldpage: currently charged page
   * @newpage: page to transfer the charge to
f5e03a498   Michal Hocko   memcg, shmem: fix...
5586
   * @lrucare: either or both pages might be on the LRU already
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5587
5588
5589
5590
5591
5592
5593
5594
   *
   * Migrate the charge from @oldpage to @newpage.
   *
   * Both pages must be locked, @newpage->mapping must be set up.
   */
  void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  			bool lrucare)
  {
298333157   Johannes Weiner   mm: memcontrol: r...
5595
  	struct mem_cgroup *memcg;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5596
5597
5598
5599
5600
5601
5602
  	int isolated;
  
  	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
  	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
  	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6abb5a867   Johannes Weiner   mm: memcontrol: a...
5603
5604
  	VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
  		       newpage);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5605
5606
5607
5608
5609
  
  	if (mem_cgroup_disabled())
  		return;
  
  	/* Page cache replacement: new page already charged? */
1306a85ae   Johannes Weiner   mm: embed the mem...
5610
  	if (newpage->mem_cgroup)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5611
  		return;
7d5e32457   Johannes Weiner   mm: memcontrol: c...
5612
5613
5614
5615
5616
5617
  	/*
  	 * Swapcache readahead pages can get migrated before being
  	 * charged, and migration from compaction can happen to an
  	 * uncharged page when the PFN walker finds a page that
  	 * reclaim just put back on the LRU but has not released yet.
  	 */
1306a85ae   Johannes Weiner   mm: embed the mem...
5618
  	memcg = oldpage->mem_cgroup;
298333157   Johannes Weiner   mm: memcontrol: r...
5619
  	if (!memcg)
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5620
  		return;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5621
5622
  	if (lrucare)
  		lock_page_lru(oldpage, &isolated);
1306a85ae   Johannes Weiner   mm: embed the mem...
5623
  	oldpage->mem_cgroup = NULL;
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5624
5625
5626
  
  	if (lrucare)
  		unlock_page_lru(oldpage, isolated);
298333157   Johannes Weiner   mm: memcontrol: r...
5627
  	commit_charge(newpage, memcg, lrucare);
0a31bc97c   Johannes Weiner   mm: memcontrol: r...
5628
  }
2d11085e4   Michal Hocko   memcg: do not cre...
5629
  /*
1081312f9   Michal Hocko   memcg: cleanup me...
5630
5631
5632
5633
5634
5635
   * subsys_initcall() for the memory controller.
   *
   * Some parts like hotcpu_notifier() have to be initialized from this context
   * because of lock dependencies (cgroup_lock -> cpu hotplug), but basically
   * everything that doesn't depend on a specific mem_cgroup structure should
   * be initialized from here.
2d11085e4   Michal Hocko   memcg: do not cre...
5636
5637
5638
   */
  static int __init mem_cgroup_init(void)
  {
95a045f63   Johannes Weiner   mm: memcontrol: c...
5639
  	int cpu, node;
2d11085e4   Michal Hocko   memcg: do not cre...
5640
  	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
95a045f63   Johannes Weiner   mm: memcontrol: c...
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
  
  	for_each_possible_cpu(cpu)
  		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
  			  drain_local_stock);
  
  	for_each_node(node) {
  		struct mem_cgroup_tree_per_node *rtpn;
  		int zone;
  
  		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
  				    node_online(node) ? node : NUMA_NO_NODE);
  
  		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  			struct mem_cgroup_tree_per_zone *rtpz;
  
  			rtpz = &rtpn->rb_tree_per_zone[zone];
  			rtpz->rb_root = RB_ROOT;
  			spin_lock_init(&rtpz->lock);
  		}
  		soft_limit_tree.rb_tree_per_node[node] = rtpn;
  	}
2d11085e4   Michal Hocko   memcg: do not cre...
5662
5663
5664
  	return 0;
  }
  subsys_initcall(mem_cgroup_init);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
  
  #ifdef CONFIG_MEMCG_SWAP
  /**
   * mem_cgroup_swapout - transfer a memsw charge to swap
   * @page: page whose memsw charge to transfer
   * @entry: swap entry to move the charge to
   *
   * Transfer the memsw charge of @page to @entry.
   */
  void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short oldid;
  
  	VM_BUG_ON_PAGE(PageLRU(page), page);
  	VM_BUG_ON_PAGE(page_count(page), page);
  
  	if (!do_swap_account)
  		return;
  
  	memcg = page->mem_cgroup;
  
  	/* Readahead page, never charged */
  	if (!memcg)
  		return;
  
  	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
  	VM_BUG_ON_PAGE(oldid, page);
  	mem_cgroup_swap_statistics(memcg, true);
  
  	page->mem_cgroup = NULL;
  
  	if (!mem_cgroup_is_root(memcg))
  		page_counter_uncharge(&memcg->memory, 1);
f371763a7   Johannes Weiner   mm: memcontrol: f...
5699
  	/* Caller disabled preemption with mapping->tree_lock */
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
  	mem_cgroup_charge_statistics(memcg, page, -1);
  	memcg_check_events(memcg, page);
  }
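  
  /*
   * Illustrative sketch (not part of this file): the swap-out path calls
   * mem_cgroup_swapout() while deleting the page from the swap cache under
   * mapping->tree_lock, after the page's references have been frozen.  This
   * only loosely mirrors the reclaim code; treat it as a sketch, not the
   * actual vmscan logic.
   */
  static void example_swapout(struct address_space *mapping, struct page *page)
  {
  	swp_entry_t entry = { .val = page_private(page) };
  
  	/* caller has already frozen the page's reference count */
  	spin_lock_irq(&mapping->tree_lock);
  	/* transfer the memsw charge before the page leaves the swap cache */
  	mem_cgroup_swapout(page, entry);
  	__delete_from_swap_cache(page);
  	spin_unlock_irq(&mapping->tree_lock);
  	/* ... release the swap cache's reference to the swap entry ... */
  }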
  
  /**
   * mem_cgroup_uncharge_swap - uncharge a swap entry
   * @entry: swap entry to uncharge
   *
   * Drop the memsw charge associated with @entry.
   */
  void mem_cgroup_uncharge_swap(swp_entry_t entry)
  {
  	struct mem_cgroup *memcg;
  	unsigned short id;
  
  	if (!do_swap_account)
  		return;
  
  	id = swap_cgroup_record(entry, 0);
  	rcu_read_lock();
adbe427b9   Vladimir Davydov   memcg: zap mem_cg...
5720
  	memcg = mem_cgroup_from_id(id);
21afa38ee   Johannes Weiner   mm: memcontrol: c...
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
  	if (memcg) {
  		if (!mem_cgroup_is_root(memcg))
  			page_counter_uncharge(&memcg->memsw, 1);
  		mem_cgroup_swap_statistics(memcg, false);
  		css_put(&memcg->css);
  	}
  	rcu_read_unlock();
  }
  
  /* remember the boot option */
  #ifdef CONFIG_MEMCG_SWAP_ENABLED
  static int really_do_swap_account __initdata = 1;
  #else
  static int really_do_swap_account __initdata;
  #endif
  
  static int __init enable_swap_account(char *s)
  {
  	if (!strcmp(s, "1"))
  		really_do_swap_account = 1;
  	else if (!strcmp(s, "0"))
  		really_do_swap_account = 0;
  	return 1;
  }
  __setup("swapaccount=", enable_swap_account);
  
  static struct cftype memsw_cgroup_files[] = {
  	{
  		.name = "memsw.usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.max_usage_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.limit_in_bytes",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
  		.write = mem_cgroup_write,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{
  		.name = "memsw.failcnt",
  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
  		.write = mem_cgroup_reset,
  		.read_u64 = mem_cgroup_read_u64,
  	},
  	{ },	/* terminate */
  };
  
  static int __init mem_cgroup_swap_init(void)
  {
  	if (!mem_cgroup_disabled() && really_do_swap_account) {
  		do_swap_account = 1;
  		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
  						  memsw_cgroup_files));
  	}
  	return 0;
  }
  subsys_initcall(mem_cgroup_swap_init);
  
  #endif /* CONFIG_MEMCG_SWAP */