Commit e8ea14cc6eadfe2ea63e9989e16e62625a2619f8

Authored by Johannes Weiner
Committed by Linus Torvalds
1 parent 5ac8fb31ad

mm: memcontrol: take a css reference for each charged page

Charges currently pin the css indirectly by playing tricks during
css_offline(): user pages stall the offlining process until all of them
have been reparented, whereas kmemcg acquires a keep-alive reference if
outstanding kernel pages are detected at that point.

In preparation for removing all this complexity, make the pinning explicit
and acquire a css reference for every charged page.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
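
As an illustration of the explicit pinning described above, the sketch below
shows how a memcg charge/uncharge path can take one css reference per charged
page using the css_get_many()/css_put_many() helpers this patch adds to
include/linux/cgroup.h. It is only a minimal sketch, assuming the usual
memcontrol context (a struct mem_cgroup with an embedded css); the function
names my_charge_pages()/my_uncharge_pages() are hypothetical and this is not
the actual mm/memcontrol.c hunk, which is not shown in this view.

	/*
	 * Illustrative sketch only: pin the memcg's css once for every page
	 * charged against it, and drop those references again on uncharge.
	 * css_get_many()/css_put_many() are the helpers introduced below;
	 * my_charge_pages()/my_uncharge_pages() are hypothetical callers.
	 */
	static void my_charge_pages(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* ... charge nr_pages to the memcg's page counters ... */
		css_get_many(&memcg->css, nr_pages);	/* one css ref per page */
	}

	static void my_uncharge_pages(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* ... uncharge nr_pages from the page counters ... */
		css_put_many(&memcg->css, nr_pages);	/* drop the per-page refs */
	}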

Showing 3 changed files with 81 additions and 13 deletions

include/linux/cgroup.h
1 #ifndef _LINUX_CGROUP_H 1 #ifndef _LINUX_CGROUP_H
2 #define _LINUX_CGROUP_H 2 #define _LINUX_CGROUP_H
3 /* 3 /*
4 * cgroup interface 4 * cgroup interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
11 #include <linux/sched.h> 11 #include <linux/sched.h>
12 #include <linux/cpumask.h> 12 #include <linux/cpumask.h>
13 #include <linux/nodemask.h> 13 #include <linux/nodemask.h>
14 #include <linux/rcupdate.h> 14 #include <linux/rcupdate.h>
15 #include <linux/rculist.h> 15 #include <linux/rculist.h>
16 #include <linux/cgroupstats.h> 16 #include <linux/cgroupstats.h>
17 #include <linux/rwsem.h> 17 #include <linux/rwsem.h>
18 #include <linux/idr.h> 18 #include <linux/idr.h>
19 #include <linux/workqueue.h> 19 #include <linux/workqueue.h>
20 #include <linux/fs.h> 20 #include <linux/fs.h>
21 #include <linux/percpu-refcount.h> 21 #include <linux/percpu-refcount.h>
22 #include <linux/seq_file.h> 22 #include <linux/seq_file.h>
23 #include <linux/kernfs.h> 23 #include <linux/kernfs.h>
24 #include <linux/wait.h> 24 #include <linux/wait.h>
25 25
26 #ifdef CONFIG_CGROUPS 26 #ifdef CONFIG_CGROUPS
27 27
28 struct cgroup_root; 28 struct cgroup_root;
29 struct cgroup_subsys; 29 struct cgroup_subsys;
30 struct cgroup; 30 struct cgroup;
31 31
32 extern int cgroup_init_early(void); 32 extern int cgroup_init_early(void);
33 extern int cgroup_init(void); 33 extern int cgroup_init(void);
34 extern void cgroup_fork(struct task_struct *p); 34 extern void cgroup_fork(struct task_struct *p);
35 extern void cgroup_post_fork(struct task_struct *p); 35 extern void cgroup_post_fork(struct task_struct *p);
36 extern void cgroup_exit(struct task_struct *p); 36 extern void cgroup_exit(struct task_struct *p);
37 extern int cgroupstats_build(struct cgroupstats *stats, 37 extern int cgroupstats_build(struct cgroupstats *stats,
38 struct dentry *dentry); 38 struct dentry *dentry);
39 39
40 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, 40 extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
41 struct pid *pid, struct task_struct *tsk); 41 struct pid *pid, struct task_struct *tsk);
42 42
43 /* define the enumeration of all cgroup subsystems */ 43 /* define the enumeration of all cgroup subsystems */
44 #define SUBSYS(_x) _x ## _cgrp_id, 44 #define SUBSYS(_x) _x ## _cgrp_id,
45 enum cgroup_subsys_id { 45 enum cgroup_subsys_id {
46 #include <linux/cgroup_subsys.h> 46 #include <linux/cgroup_subsys.h>
47 CGROUP_SUBSYS_COUNT, 47 CGROUP_SUBSYS_COUNT,
48 }; 48 };
49 #undef SUBSYS 49 #undef SUBSYS
50 50
51 /* 51 /*
52 * Per-subsystem/per-cgroup state maintained by the system. This is the 52 * Per-subsystem/per-cgroup state maintained by the system. This is the
53 * fundamental structural building block that controllers deal with. 53 * fundamental structural building block that controllers deal with.
54 * 54 *
55 * Fields marked with "PI:" are public and immutable and may be accessed 55 * Fields marked with "PI:" are public and immutable and may be accessed
56 * directly without synchronization. 56 * directly without synchronization.
57 */ 57 */
58 struct cgroup_subsys_state { 58 struct cgroup_subsys_state {
59 /* PI: the cgroup that this css is attached to */ 59 /* PI: the cgroup that this css is attached to */
60 struct cgroup *cgroup; 60 struct cgroup *cgroup;
61 61
62 /* PI: the cgroup subsystem that this css is attached to */ 62 /* PI: the cgroup subsystem that this css is attached to */
63 struct cgroup_subsys *ss; 63 struct cgroup_subsys *ss;
64 64
65 /* reference count - access via css_[try]get() and css_put() */ 65 /* reference count - access via css_[try]get() and css_put() */
66 struct percpu_ref refcnt; 66 struct percpu_ref refcnt;
67 67
68 /* PI: the parent css */ 68 /* PI: the parent css */
69 struct cgroup_subsys_state *parent; 69 struct cgroup_subsys_state *parent;
70 70
71 /* siblings list anchored at the parent's ->children */ 71 /* siblings list anchored at the parent's ->children */
72 struct list_head sibling; 72 struct list_head sibling;
73 struct list_head children; 73 struct list_head children;
74 74
75 /* 75 /*
76 * PI: Subsys-unique ID. 0 is unused and root is always 1. The 76 * PI: Subsys-unique ID. 0 is unused and root is always 1. The
77 * matching css can be looked up using css_from_id(). 77 * matching css can be looked up using css_from_id().
78 */ 78 */
79 int id; 79 int id;
80 80
81 unsigned int flags; 81 unsigned int flags;
82 82
83 /* 83 /*
84 * Monotonically increasing unique serial number which defines a 84 * Monotonically increasing unique serial number which defines a
85 * uniform order among all csses. It's guaranteed that all 85 * uniform order among all csses. It's guaranteed that all
86 * ->children lists are in the ascending order of ->serial_nr and 86 * ->children lists are in the ascending order of ->serial_nr and
87 * used to allow interrupting and resuming iterations. 87 * used to allow interrupting and resuming iterations.
88 */ 88 */
89 u64 serial_nr; 89 u64 serial_nr;
90 90
91 /* percpu_ref killing and RCU release */ 91 /* percpu_ref killing and RCU release */
92 struct rcu_head rcu_head; 92 struct rcu_head rcu_head;
93 struct work_struct destroy_work; 93 struct work_struct destroy_work;
94 }; 94 };
95 95
96 /* bits in struct cgroup_subsys_state flags field */ 96 /* bits in struct cgroup_subsys_state flags field */
97 enum { 97 enum {
98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */ 98 CSS_NO_REF = (1 << 0), /* no reference counting for this css */
99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ 99 CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */
100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ 100 CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */
101 }; 101 };
102 102
103 /** 103 /**
104 * css_get - obtain a reference on the specified css 104 * css_get - obtain a reference on the specified css
105 * @css: target css 105 * @css: target css
106 * 106 *
107 * The caller must already have a reference. 107 * The caller must already have a reference.
108 */ 108 */
109 static inline void css_get(struct cgroup_subsys_state *css) 109 static inline void css_get(struct cgroup_subsys_state *css)
110 { 110 {
111 if (!(css->flags & CSS_NO_REF)) 111 if (!(css->flags & CSS_NO_REF))
112 percpu_ref_get(&css->refcnt); 112 percpu_ref_get(&css->refcnt);
113 } 113 }
114 114
115 /** 115 /**
116 * css_get_many - obtain references on the specified css
117 * @css: target css
118 * @n: number of references to get
119 *
120 * The caller must already have a reference.
121 */
122 static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
123 {
124 if (!(css->flags & CSS_NO_REF))
125 percpu_ref_get_many(&css->refcnt, n);
126 }
127
128 /**
116 * css_tryget - try to obtain a reference on the specified css 129 * css_tryget - try to obtain a reference on the specified css
117 * @css: target css 130 * @css: target css
118 * 131 *
119 * Obtain a reference on @css unless it already has reached zero and is 132 * Obtain a reference on @css unless it already has reached zero and is
120 * being released. This function doesn't care whether @css is on or 133 * being released. This function doesn't care whether @css is on or
121 * offline. The caller naturally needs to ensure that @css is accessible 134 * offline. The caller naturally needs to ensure that @css is accessible
122 * but doesn't have to be holding a reference on it - IOW, RCU protected 135 * but doesn't have to be holding a reference on it - IOW, RCU protected
123 * access is good enough for this function. Returns %true if a reference 136 * access is good enough for this function. Returns %true if a reference
124 * count was successfully obtained; %false otherwise. 137 * count was successfully obtained; %false otherwise.
125 */ 138 */
126 static inline bool css_tryget(struct cgroup_subsys_state *css) 139 static inline bool css_tryget(struct cgroup_subsys_state *css)
127 { 140 {
128 if (!(css->flags & CSS_NO_REF)) 141 if (!(css->flags & CSS_NO_REF))
129 return percpu_ref_tryget(&css->refcnt); 142 return percpu_ref_tryget(&css->refcnt);
130 return true; 143 return true;
131 } 144 }
132 145
133 /** 146 /**
134 * css_tryget_online - try to obtain a reference on the specified css if online 147 * css_tryget_online - try to obtain a reference on the specified css if online
135 * @css: target css 148 * @css: target css
136 * 149 *
137 * Obtain a reference on @css if it's online. The caller naturally needs 150 * Obtain a reference on @css if it's online. The caller naturally needs
138 * to ensure that @css is accessible but doesn't have to be holding a 151 * to ensure that @css is accessible but doesn't have to be holding a
139 * reference on it - IOW, RCU protected access is good enough for this 152 * reference on it - IOW, RCU protected access is good enough for this
140 * function. Returns %true if a reference count was successfully obtained; 153 * function. Returns %true if a reference count was successfully obtained;
141 * %false otherwise. 154 * %false otherwise.
142 */ 155 */
143 static inline bool css_tryget_online(struct cgroup_subsys_state *css) 156 static inline bool css_tryget_online(struct cgroup_subsys_state *css)
144 { 157 {
145 if (!(css->flags & CSS_NO_REF)) 158 if (!(css->flags & CSS_NO_REF))
146 return percpu_ref_tryget_live(&css->refcnt); 159 return percpu_ref_tryget_live(&css->refcnt);
147 return true; 160 return true;
148 } 161 }
149 162
150 /** 163 /**
151 * css_put - put a css reference 164 * css_put - put a css reference
152 * @css: target css 165 * @css: target css
153 * 166 *
154 * Put a reference obtained via css_get() and css_tryget_online(). 167 * Put a reference obtained via css_get() and css_tryget_online().
155 */ 168 */
156 static inline void css_put(struct cgroup_subsys_state *css) 169 static inline void css_put(struct cgroup_subsys_state *css)
157 { 170 {
158 if (!(css->flags & CSS_NO_REF)) 171 if (!(css->flags & CSS_NO_REF))
159 percpu_ref_put(&css->refcnt); 172 percpu_ref_put(&css->refcnt);
173 }
174
175 /**
176 * css_put_many - put css references
177 * @css: target css
178 * @n: number of references to put
179 *
180 * Put references obtained via css_get() and css_tryget_online().
181 */
182 static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
183 {
184 if (!(css->flags & CSS_NO_REF))
185 percpu_ref_put_many(&css->refcnt, n);
160 } 186 }
161 187
162 /* bits in struct cgroup flags field */ 188 /* bits in struct cgroup flags field */
163 enum { 189 enum {
164 /* Control Group requires release notifications to userspace */ 190 /* Control Group requires release notifications to userspace */
165 CGRP_NOTIFY_ON_RELEASE, 191 CGRP_NOTIFY_ON_RELEASE,
166 /* 192 /*
167 * Clone the parent's configuration when creating a new child 193 * Clone the parent's configuration when creating a new child
168 * cpuset cgroup. For historical reasons, this option can be 194 * cpuset cgroup. For historical reasons, this option can be
169 * specified at mount time and thus is implemented here. 195 * specified at mount time and thus is implemented here.
170 */ 196 */
171 CGRP_CPUSET_CLONE_CHILDREN, 197 CGRP_CPUSET_CLONE_CHILDREN,
172 }; 198 };
173 199
174 struct cgroup { 200 struct cgroup {
175 /* self css with NULL ->ss, points back to this cgroup */ 201 /* self css with NULL ->ss, points back to this cgroup */
176 struct cgroup_subsys_state self; 202 struct cgroup_subsys_state self;
177 203
178 unsigned long flags; /* "unsigned long" so bitops work */ 204 unsigned long flags; /* "unsigned long" so bitops work */
179 205
180 /* 206 /*
181 * idr allocated in-hierarchy ID. 207 * idr allocated in-hierarchy ID.
182 * 208 *
183 * ID 0 is not used, the ID of the root cgroup is always 1, and a 209 * ID 0 is not used, the ID of the root cgroup is always 1, and a
184 * new cgroup will be assigned with a smallest available ID. 210 * new cgroup will be assigned with a smallest available ID.
185 * 211 *
186 * Allocating/Removing ID must be protected by cgroup_mutex. 212 * Allocating/Removing ID must be protected by cgroup_mutex.
187 */ 213 */
188 int id; 214 int id;
189 215
190 /* 216 /*
191 * If this cgroup contains any tasks, it contributes one to 217 * If this cgroup contains any tasks, it contributes one to
192 * populated_cnt. All children with non-zero popuplated_cnt of 218 * populated_cnt. All children with non-zero popuplated_cnt of
193 * their own contribute one. The count is zero iff there's no task 219 * their own contribute one. The count is zero iff there's no task
194 * in this cgroup or its subtree. 220 * in this cgroup or its subtree.
195 */ 221 */
196 int populated_cnt; 222 int populated_cnt;
197 223
198 struct kernfs_node *kn; /* cgroup kernfs entry */ 224 struct kernfs_node *kn; /* cgroup kernfs entry */
199 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ 225 struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
200 226
201 /* 227 /*
202 * The bitmask of subsystems enabled on the child cgroups. 228 * The bitmask of subsystems enabled on the child cgroups.
203 * ->subtree_control is the one configured through 229 * ->subtree_control is the one configured through
204 * "cgroup.subtree_control" while ->child_subsys_mask is the 230 * "cgroup.subtree_control" while ->child_subsys_mask is the
205 * effective one which may have more subsystems enabled. 231 * effective one which may have more subsystems enabled.
206 * Controller knobs are made available iff it's enabled in 232 * Controller knobs are made available iff it's enabled in
207 * ->subtree_control. 233 * ->subtree_control.
208 */ 234 */
209 unsigned int subtree_control; 235 unsigned int subtree_control;
210 unsigned int child_subsys_mask; 236 unsigned int child_subsys_mask;
211 237
212 /* Private pointers for each registered subsystem */ 238 /* Private pointers for each registered subsystem */
213 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; 239 struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
214 240
215 struct cgroup_root *root; 241 struct cgroup_root *root;
216 242
217 /* 243 /*
218 * List of cgrp_cset_links pointing at css_sets with tasks in this 244 * List of cgrp_cset_links pointing at css_sets with tasks in this
219 * cgroup. Protected by css_set_lock. 245 * cgroup. Protected by css_set_lock.
220 */ 246 */
221 struct list_head cset_links; 247 struct list_head cset_links;
222 248
223 /* 249 /*
224 * On the default hierarchy, a css_set for a cgroup with some 250 * On the default hierarchy, a css_set for a cgroup with some
225 * susbsys disabled will point to css's which are associated with 251 * susbsys disabled will point to css's which are associated with
226 * the closest ancestor which has the subsys enabled. The 252 * the closest ancestor which has the subsys enabled. The
227 * following lists all css_sets which point to this cgroup's css 253 * following lists all css_sets which point to this cgroup's css
228 * for the given subsystem. 254 * for the given subsystem.
229 */ 255 */
230 struct list_head e_csets[CGROUP_SUBSYS_COUNT]; 256 struct list_head e_csets[CGROUP_SUBSYS_COUNT];
231 257
232 /* 258 /*
233 * list of pidlists, up to two for each namespace (one for procs, one 259 * list of pidlists, up to two for each namespace (one for procs, one
234 * for tasks); created on demand. 260 * for tasks); created on demand.
235 */ 261 */
236 struct list_head pidlists; 262 struct list_head pidlists;
237 struct mutex pidlist_mutex; 263 struct mutex pidlist_mutex;
238 264
239 /* used to wait for offlining of csses */ 265 /* used to wait for offlining of csses */
240 wait_queue_head_t offline_waitq; 266 wait_queue_head_t offline_waitq;
241 267
242 /* used to schedule release agent */ 268 /* used to schedule release agent */
243 struct work_struct release_agent_work; 269 struct work_struct release_agent_work;
244 }; 270 };
245 271
246 #define MAX_CGROUP_ROOT_NAMELEN 64 272 #define MAX_CGROUP_ROOT_NAMELEN 64
247 273
248 /* cgroup_root->flags */ 274 /* cgroup_root->flags */
249 enum { 275 enum {
250 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ 276 CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
251 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ 277 CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
252 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ 278 CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
253 }; 279 };
254 280
255 /* 281 /*
256 * A cgroup_root represents the root of a cgroup hierarchy, and may be 282 * A cgroup_root represents the root of a cgroup hierarchy, and may be
257 * associated with a kernfs_root to form an active hierarchy. This is 283 * associated with a kernfs_root to form an active hierarchy. This is
258 * internal to cgroup core. Don't access directly from controllers. 284 * internal to cgroup core. Don't access directly from controllers.
259 */ 285 */
260 struct cgroup_root { 286 struct cgroup_root {
261 struct kernfs_root *kf_root; 287 struct kernfs_root *kf_root;
262 288
263 /* The bitmask of subsystems attached to this hierarchy */ 289 /* The bitmask of subsystems attached to this hierarchy */
264 unsigned int subsys_mask; 290 unsigned int subsys_mask;
265 291
266 /* Unique id for this hierarchy. */ 292 /* Unique id for this hierarchy. */
267 int hierarchy_id; 293 int hierarchy_id;
268 294
269 /* The root cgroup. Root is destroyed on its release. */ 295 /* The root cgroup. Root is destroyed on its release. */
270 struct cgroup cgrp; 296 struct cgroup cgrp;
271 297
272 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ 298 /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
273 atomic_t nr_cgrps; 299 atomic_t nr_cgrps;
274 300
275 /* A list running through the active hierarchies */ 301 /* A list running through the active hierarchies */
276 struct list_head root_list; 302 struct list_head root_list;
277 303
278 /* Hierarchy-specific flags */ 304 /* Hierarchy-specific flags */
279 unsigned int flags; 305 unsigned int flags;
280 306
281 /* IDs for cgroups in this hierarchy */ 307 /* IDs for cgroups in this hierarchy */
282 struct idr cgroup_idr; 308 struct idr cgroup_idr;
283 309
284 /* The path to use for release notifications. */ 310 /* The path to use for release notifications. */
285 char release_agent_path[PATH_MAX]; 311 char release_agent_path[PATH_MAX];
286 312
287 /* The name for this hierarchy - may be empty */ 313 /* The name for this hierarchy - may be empty */
288 char name[MAX_CGROUP_ROOT_NAMELEN]; 314 char name[MAX_CGROUP_ROOT_NAMELEN];
289 }; 315 };
290 316
291 /* 317 /*
292 * A css_set is a structure holding pointers to a set of 318 * A css_set is a structure holding pointers to a set of
293 * cgroup_subsys_state objects. This saves space in the task struct 319 * cgroup_subsys_state objects. This saves space in the task struct
294 * object and speeds up fork()/exit(), since a single inc/dec and a 320 * object and speeds up fork()/exit(), since a single inc/dec and a
295 * list_add()/del() can bump the reference count on the entire cgroup 321 * list_add()/del() can bump the reference count on the entire cgroup
296 * set for a task. 322 * set for a task.
297 */ 323 */
298 324
299 struct css_set { 325 struct css_set {
300 326
301 /* Reference count */ 327 /* Reference count */
302 atomic_t refcount; 328 atomic_t refcount;
303 329
304 /* 330 /*
305 * List running through all cgroup groups in the same hash 331 * List running through all cgroup groups in the same hash
306 * slot. Protected by css_set_lock 332 * slot. Protected by css_set_lock
307 */ 333 */
308 struct hlist_node hlist; 334 struct hlist_node hlist;
309 335
310 /* 336 /*
311 * Lists running through all tasks using this cgroup group. 337 * Lists running through all tasks using this cgroup group.
312 * mg_tasks lists tasks which belong to this cset but are in the 338 * mg_tasks lists tasks which belong to this cset but are in the
313 * process of being migrated out or in. Protected by 339 * process of being migrated out or in. Protected by
314 * css_set_rwsem, but, during migration, once tasks are moved to 340 * css_set_rwsem, but, during migration, once tasks are moved to
315 * mg_tasks, it can be read safely while holding cgroup_mutex. 341 * mg_tasks, it can be read safely while holding cgroup_mutex.
316 */ 342 */
317 struct list_head tasks; 343 struct list_head tasks;
318 struct list_head mg_tasks; 344 struct list_head mg_tasks;
319 345
320 /* 346 /*
321 * List of cgrp_cset_links pointing at cgroups referenced from this 347 * List of cgrp_cset_links pointing at cgroups referenced from this
322 * css_set. Protected by css_set_lock. 348 * css_set. Protected by css_set_lock.
323 */ 349 */
324 struct list_head cgrp_links; 350 struct list_head cgrp_links;
325 351
326 /* the default cgroup associated with this css_set */ 352 /* the default cgroup associated with this css_set */
327 struct cgroup *dfl_cgrp; 353 struct cgroup *dfl_cgrp;
328 354
329 /* 355 /*
330 * Set of subsystem states, one for each subsystem. This array is 356 * Set of subsystem states, one for each subsystem. This array is
331 * immutable after creation apart from the init_css_set during 357 * immutable after creation apart from the init_css_set during
332 * subsystem registration (at boot time). 358 * subsystem registration (at boot time).
333 */ 359 */
334 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 360 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
335 361
336 /* 362 /*
337 * List of csets participating in the on-going migration either as 363 * List of csets participating in the on-going migration either as
338 * source or destination. Protected by cgroup_mutex. 364 * source or destination. Protected by cgroup_mutex.
339 */ 365 */
340 struct list_head mg_preload_node; 366 struct list_head mg_preload_node;
341 struct list_head mg_node; 367 struct list_head mg_node;
342 368
343 /* 369 /*
344 * If this cset is acting as the source of migration the following 370 * If this cset is acting as the source of migration the following
345 * two fields are set. mg_src_cgrp is the source cgroup of the 371 * two fields are set. mg_src_cgrp is the source cgroup of the
346 * on-going migration and mg_dst_cset is the destination cset the 372 * on-going migration and mg_dst_cset is the destination cset the
347 * target tasks on this cset should be migrated to. Protected by 373 * target tasks on this cset should be migrated to. Protected by
348 * cgroup_mutex. 374 * cgroup_mutex.
349 */ 375 */
350 struct cgroup *mg_src_cgrp; 376 struct cgroup *mg_src_cgrp;
351 struct css_set *mg_dst_cset; 377 struct css_set *mg_dst_cset;
352 378
353 /* 379 /*
354 * On the default hierarhcy, ->subsys[ssid] may point to a css 380 * On the default hierarhcy, ->subsys[ssid] may point to a css
355 * attached to an ancestor instead of the cgroup this css_set is 381 * attached to an ancestor instead of the cgroup this css_set is
356 * associated with. The following node is anchored at 382 * associated with. The following node is anchored at
357 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to 383 * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
358 * iterate through all css's attached to a given cgroup. 384 * iterate through all css's attached to a given cgroup.
359 */ 385 */
360 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; 386 struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
361 387
362 /* For RCU-protected deletion */ 388 /* For RCU-protected deletion */
363 struct rcu_head rcu_head; 389 struct rcu_head rcu_head;
364 }; 390 };
365 391
366 /* 392 /*
367 * struct cftype: handler definitions for cgroup control files 393 * struct cftype: handler definitions for cgroup control files
368 * 394 *
369 * When reading/writing to a file: 395 * When reading/writing to a file:
370 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata 396 * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
371 * - the 'cftype' of the file is file->f_dentry->d_fsdata 397 * - the 'cftype' of the file is file->f_dentry->d_fsdata
372 */ 398 */
373 399
374 /* cftype->flags */ 400 /* cftype->flags */
375 enum { 401 enum {
376 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ 402 CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
377 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ 403 CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
378 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ 404 CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
379 405
380 /* internal flags, do not use outside cgroup core proper */ 406 /* internal flags, do not use outside cgroup core proper */
381 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ 407 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
382 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ 408 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
383 }; 409 };
384 410
385 #define MAX_CFTYPE_NAME 64 411 #define MAX_CFTYPE_NAME 64
386 412
387 struct cftype { 413 struct cftype {
388 /* 414 /*
389 * By convention, the name should begin with the name of the 415 * By convention, the name should begin with the name of the
390 * subsystem, followed by a period. Zero length string indicates 416 * subsystem, followed by a period. Zero length string indicates
391 * end of cftype array. 417 * end of cftype array.
392 */ 418 */
393 char name[MAX_CFTYPE_NAME]; 419 char name[MAX_CFTYPE_NAME];
394 int private; 420 int private;
395 /* 421 /*
396 * If not 0, file mode is set to this value, otherwise it will 422 * If not 0, file mode is set to this value, otherwise it will
397 * be figured out automatically 423 * be figured out automatically
398 */ 424 */
399 umode_t mode; 425 umode_t mode;
400 426
401 /* 427 /*
402 * The maximum length of string, excluding trailing nul, that can 428 * The maximum length of string, excluding trailing nul, that can
403 * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. 429 * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
404 */ 430 */
405 size_t max_write_len; 431 size_t max_write_len;
406 432
407 /* CFTYPE_* flags */ 433 /* CFTYPE_* flags */
408 unsigned int flags; 434 unsigned int flags;
409 435
410 /* 436 /*
411 * Fields used for internal bookkeeping. Initialized automatically 437 * Fields used for internal bookkeeping. Initialized automatically
412 * during registration. 438 * during registration.
413 */ 439 */
414 struct cgroup_subsys *ss; /* NULL for cgroup core files */ 440 struct cgroup_subsys *ss; /* NULL for cgroup core files */
415 struct list_head node; /* anchored at ss->cfts */ 441 struct list_head node; /* anchored at ss->cfts */
416 struct kernfs_ops *kf_ops; 442 struct kernfs_ops *kf_ops;
417 443
418 /* 444 /*
419 * read_u64() is a shortcut for the common case of returning a 445 * read_u64() is a shortcut for the common case of returning a
420 * single integer. Use it in place of read() 446 * single integer. Use it in place of read()
421 */ 447 */
422 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); 448 u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
423 /* 449 /*
424 * read_s64() is a signed version of read_u64() 450 * read_s64() is a signed version of read_u64()
425 */ 451 */
426 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); 452 s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
427 453
428 /* generic seq_file read interface */ 454 /* generic seq_file read interface */
429 int (*seq_show)(struct seq_file *sf, void *v); 455 int (*seq_show)(struct seq_file *sf, void *v);
430 456
431 /* optional ops, implement all or none */ 457 /* optional ops, implement all or none */
432 void *(*seq_start)(struct seq_file *sf, loff_t *ppos); 458 void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
433 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); 459 void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
434 void (*seq_stop)(struct seq_file *sf, void *v); 460 void (*seq_stop)(struct seq_file *sf, void *v);
435 461
436 /* 462 /*
437 * write_u64() is a shortcut for the common case of accepting 463 * write_u64() is a shortcut for the common case of accepting
438 * a single integer (as parsed by simple_strtoull) from 464 * a single integer (as parsed by simple_strtoull) from
439 * userspace. Use in place of write(); return 0 or error. 465 * userspace. Use in place of write(); return 0 or error.
440 */ 466 */
441 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, 467 int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
442 u64 val); 468 u64 val);
443 /* 469 /*
444 * write_s64() is a signed version of write_u64() 470 * write_s64() is a signed version of write_u64()
445 */ 471 */
446 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, 472 int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
447 s64 val); 473 s64 val);
448 474
449 /* 475 /*
450 * write() is the generic write callback which maps directly to 476 * write() is the generic write callback which maps directly to
451 * kernfs write operation and overrides all other operations. 477 * kernfs write operation and overrides all other operations.
452 * Maximum write size is determined by ->max_write_len. Use 478 * Maximum write size is determined by ->max_write_len. Use
453 * of_css/cft() to access the associated css and cft. 479 * of_css/cft() to access the associated css and cft.
454 */ 480 */
455 ssize_t (*write)(struct kernfs_open_file *of, 481 ssize_t (*write)(struct kernfs_open_file *of,
456 char *buf, size_t nbytes, loff_t off); 482 char *buf, size_t nbytes, loff_t off);
457 483
458 #ifdef CONFIG_DEBUG_LOCK_ALLOC 484 #ifdef CONFIG_DEBUG_LOCK_ALLOC
459 struct lock_class_key lockdep_key; 485 struct lock_class_key lockdep_key;
460 #endif 486 #endif
461 }; 487 };
462 488
463 extern struct cgroup_root cgrp_dfl_root; 489 extern struct cgroup_root cgrp_dfl_root;
464 extern struct css_set init_css_set; 490 extern struct css_set init_css_set;
465 491
466 /** 492 /**
467 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy 493 * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
468 * @cgrp: the cgroup of interest 494 * @cgrp: the cgroup of interest
469 * 495 *
470 * The default hierarchy is the v2 interface of cgroup and this function 496 * The default hierarchy is the v2 interface of cgroup and this function
471 * can be used to test whether a cgroup is on the default hierarchy for 497 * can be used to test whether a cgroup is on the default hierarchy for
472 * cases where a subsystem should behave differnetly depending on the 498 * cases where a subsystem should behave differnetly depending on the
473 * interface version. 499 * interface version.
474 * 500 *
475 * The set of behaviors which change on the default hierarchy are still 501 * The set of behaviors which change on the default hierarchy are still
476 * being determined and the mount option is prefixed with __DEVEL__. 502 * being determined and the mount option is prefixed with __DEVEL__.
477 * 503 *
478 * List of changed behaviors: 504 * List of changed behaviors:
479 * 505 *
480 * - Mount options "noprefix", "xattr", "clone_children", "release_agent" 506 * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
481 * and "name" are disallowed. 507 * and "name" are disallowed.
482 * 508 *
483 * - When mounting an existing superblock, mount options should match. 509 * - When mounting an existing superblock, mount options should match.
484 * 510 *
485 * - Remount is disallowed. 511 * - Remount is disallowed.
486 * 512 *
487 * - rename(2) is disallowed. 513 * - rename(2) is disallowed.
488 * 514 *
489 * - "tasks" is removed. Everything should be at process granularity. Use 515 * - "tasks" is removed. Everything should be at process granularity. Use
490 * "cgroup.procs" instead. 516 * "cgroup.procs" instead.
491 * 517 *
492 * - "cgroup.procs" is not sorted. pids will be unique unless they got 518 * - "cgroup.procs" is not sorted. pids will be unique unless they got
493 * recycled inbetween reads. 519 * recycled inbetween reads.
494 * 520 *
495 * - "release_agent" and "notify_on_release" are removed. Replacement 521 * - "release_agent" and "notify_on_release" are removed. Replacement
496 * notification mechanism will be implemented. 522 * notification mechanism will be implemented.
497 * 523 *
498 * - "cgroup.clone_children" is removed. 524 * - "cgroup.clone_children" is removed.
499 * 525 *
500 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup 526 * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
501 * and its descendants contain no task; otherwise, 1. The file also 527 * and its descendants contain no task; otherwise, 1. The file also
502 * generates kernfs notification which can be monitored through poll and 528 * generates kernfs notification which can be monitored through poll and
503 * [di]notify when the value of the file changes. 529 * [di]notify when the value of the file changes.
504 * 530 *
505 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and 531 * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
506 * take masks of ancestors with non-empty cpus/mems, instead of being 532 * take masks of ancestors with non-empty cpus/mems, instead of being
507 * moved to an ancestor. 533 * moved to an ancestor.
508 * 534 *
509 * - cpuset: a task can be moved into an empty cpuset, and again it takes 535 * - cpuset: a task can be moved into an empty cpuset, and again it takes
510 * masks of ancestors. 536 * masks of ancestors.
511 * 537 *
512 * - memcg: use_hierarchy is on by default and the cgroup file for the flag 538 * - memcg: use_hierarchy is on by default and the cgroup file for the flag
513 * is not created. 539 * is not created.
514 * 540 *
515 * - blkcg: blk-throttle becomes properly hierarchical. 541 * - blkcg: blk-throttle becomes properly hierarchical.
516 * 542 *
517 * - debug: disallowed on the default hierarchy. 543 * - debug: disallowed on the default hierarchy.
518 */ 544 */
519 static inline bool cgroup_on_dfl(const struct cgroup *cgrp) 545 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
520 { 546 {
521 return cgrp->root == &cgrp_dfl_root; 547 return cgrp->root == &cgrp_dfl_root;
522 } 548 }
523 549
524 /* no synchronization, the result can only be used as a hint */ 550 /* no synchronization, the result can only be used as a hint */
525 static inline bool cgroup_has_tasks(struct cgroup *cgrp) 551 static inline bool cgroup_has_tasks(struct cgroup *cgrp)
526 { 552 {
527 return !list_empty(&cgrp->cset_links); 553 return !list_empty(&cgrp->cset_links);
528 } 554 }
529 555
530 /* returns ino associated with a cgroup */ 556 /* returns ino associated with a cgroup */
531 static inline ino_t cgroup_ino(struct cgroup *cgrp) 557 static inline ino_t cgroup_ino(struct cgroup *cgrp)
532 { 558 {
533 return cgrp->kn->ino; 559 return cgrp->kn->ino;
534 } 560 }
535 561
536 /* cft/css accessors for cftype->write() operation */ 562 /* cft/css accessors for cftype->write() operation */
537 static inline struct cftype *of_cft(struct kernfs_open_file *of) 563 static inline struct cftype *of_cft(struct kernfs_open_file *of)
538 { 564 {
539 return of->kn->priv; 565 return of->kn->priv;
540 } 566 }
541 567
542 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); 568 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
543 569
544 /* cft/css accessors for cftype->seq_*() operations */ 570 /* cft/css accessors for cftype->seq_*() operations */
545 static inline struct cftype *seq_cft(struct seq_file *seq) 571 static inline struct cftype *seq_cft(struct seq_file *seq)
546 { 572 {
547 return of_cft(seq->private); 573 return of_cft(seq->private);
548 } 574 }
549 575
550 static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) 576 static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
551 { 577 {
552 return of_css(seq->private); 578 return of_css(seq->private);
553 } 579 }
554 580
555 /* 581 /*
556 * Name / path handling functions. All are thin wrappers around the kernfs 582 * Name / path handling functions. All are thin wrappers around the kernfs
557 * counterparts and can be called under any context. 583 * counterparts and can be called under any context.
558 */ 584 */
559 585
560 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) 586 static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
561 { 587 {
562 return kernfs_name(cgrp->kn, buf, buflen); 588 return kernfs_name(cgrp->kn, buf, buflen);
563 } 589 }
564 590
565 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, 591 static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
566 size_t buflen) 592 size_t buflen)
567 { 593 {
568 return kernfs_path(cgrp->kn, buf, buflen); 594 return kernfs_path(cgrp->kn, buf, buflen);
569 } 595 }
570 596
571 static inline void pr_cont_cgroup_name(struct cgroup *cgrp) 597 static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
572 { 598 {
573 pr_cont_kernfs_name(cgrp->kn); 599 pr_cont_kernfs_name(cgrp->kn);
574 } 600 }
575 601
576 static inline void pr_cont_cgroup_path(struct cgroup *cgrp) 602 static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
577 { 603 {
578 pr_cont_kernfs_path(cgrp->kn); 604 pr_cont_kernfs_path(cgrp->kn);
579 } 605 }
580 606
581 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); 607 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
582 608
583 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 609 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
584 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); 610 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
585 int cgroup_rm_cftypes(struct cftype *cfts); 611 int cgroup_rm_cftypes(struct cftype *cfts);
586 612
587 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); 613 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
588 614
589 /* 615 /*
590 * Control Group taskset, used to pass around set of tasks to cgroup_subsys 616 * Control Group taskset, used to pass around set of tasks to cgroup_subsys
591 * methods. 617 * methods.
592 */ 618 */
593 struct cgroup_taskset; 619 struct cgroup_taskset;
594 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 620 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
595 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 621 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
596 622
597 /** 623 /**
598 * cgroup_taskset_for_each - iterate cgroup_taskset 624 * cgroup_taskset_for_each - iterate cgroup_taskset
599 * @task: the loop cursor 625 * @task: the loop cursor
600 * @tset: taskset to iterate 626 * @tset: taskset to iterate
601 */ 627 */
602 #define cgroup_taskset_for_each(task, tset) \ 628 #define cgroup_taskset_for_each(task, tset) \
603 for ((task) = cgroup_taskset_first((tset)); (task); \ 629 for ((task) = cgroup_taskset_first((tset)); (task); \
604 (task) = cgroup_taskset_next((tset))) 630 (task) = cgroup_taskset_next((tset)))
605 631
606 /* 632 /*
607 * Control Group subsystem type. 633 * Control Group subsystem type.
608 * See Documentation/cgroups/cgroups.txt for details 634 * See Documentation/cgroups/cgroups.txt for details
609 */ 635 */
610 636
611 struct cgroup_subsys { 637 struct cgroup_subsys {
612 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); 638 struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
613 int (*css_online)(struct cgroup_subsys_state *css); 639 int (*css_online)(struct cgroup_subsys_state *css);
614 void (*css_offline)(struct cgroup_subsys_state *css); 640 void (*css_offline)(struct cgroup_subsys_state *css);
615 void (*css_free)(struct cgroup_subsys_state *css); 641 void (*css_free)(struct cgroup_subsys_state *css);
616 void (*css_reset)(struct cgroup_subsys_state *css); 642 void (*css_reset)(struct cgroup_subsys_state *css);
617 643
618 int (*can_attach)(struct cgroup_subsys_state *css, 644 int (*can_attach)(struct cgroup_subsys_state *css,
619 struct cgroup_taskset *tset); 645 struct cgroup_taskset *tset);
620 void (*cancel_attach)(struct cgroup_subsys_state *css, 646 void (*cancel_attach)(struct cgroup_subsys_state *css,
621 struct cgroup_taskset *tset); 647 struct cgroup_taskset *tset);
622 void (*attach)(struct cgroup_subsys_state *css, 648 void (*attach)(struct cgroup_subsys_state *css,
623 struct cgroup_taskset *tset); 649 struct cgroup_taskset *tset);
624 void (*fork)(struct task_struct *task); 650 void (*fork)(struct task_struct *task);
625 void (*exit)(struct cgroup_subsys_state *css, 651 void (*exit)(struct cgroup_subsys_state *css,
626 struct cgroup_subsys_state *old_css, 652 struct cgroup_subsys_state *old_css,
627 struct task_struct *task); 653 struct task_struct *task);
628 void (*bind)(struct cgroup_subsys_state *root_css); 654 void (*bind)(struct cgroup_subsys_state *root_css);
629 655
630 int disabled; 656 int disabled;
631 int early_init; 657 int early_init;
632 658
633 /* 659 /*
634 * If %false, this subsystem is properly hierarchical - 660 * If %false, this subsystem is properly hierarchical -
635 * configuration, resource accounting and restriction on a parent 661 * configuration, resource accounting and restriction on a parent
636 * cgroup cover those of its children. If %true, hierarchy support 662 * cgroup cover those of its children. If %true, hierarchy support
637 * is broken in some ways - some subsystems ignore hierarchy 663 * is broken in some ways - some subsystems ignore hierarchy
638 * completely while others are only implemented half-way. 664 * completely while others are only implemented half-way.
639 * 665 *
640 * It's now disallowed to create nested cgroups if the subsystem is 666 * It's now disallowed to create nested cgroups if the subsystem is
641 * broken and cgroup core will emit a warning message on such 667 * broken and cgroup core will emit a warning message on such
642 * cases. Eventually, all subsystems will be made properly 668 * cases. Eventually, all subsystems will be made properly
643 * hierarchical and this will go away. 669 * hierarchical and this will go away.
644 */ 670 */
645 bool broken_hierarchy; 671 bool broken_hierarchy;
646 bool warned_broken_hierarchy; 672 bool warned_broken_hierarchy;
647 673
648 /* the following two fields are initialized automtically during boot */ 674 /* the following two fields are initialized automtically during boot */
649 int id; 675 int id;
650 #define MAX_CGROUP_TYPE_NAMELEN 32 676 #define MAX_CGROUP_TYPE_NAMELEN 32
651 const char *name; 677 const char *name;
652 678
653 /* link to parent, protected by cgroup_lock() */ 679 /* link to parent, protected by cgroup_lock() */
654 struct cgroup_root *root; 680 struct cgroup_root *root;
655 681
656 /* idr for css->id */ 682 /* idr for css->id */
657 struct idr css_idr; 683 struct idr css_idr;
658 684
659 /* 685 /*
660 * List of cftypes. Each entry is the first entry of an array 686 * List of cftypes. Each entry is the first entry of an array
661 * terminated by zero length name. 687 * terminated by zero length name.
662 */ 688 */
663 struct list_head cfts; 689 struct list_head cfts;
664 690
665 /* 691 /*
666 * Base cftypes which are automatically registered. The two can 692 * Base cftypes which are automatically registered. The two can
667 * point to the same array. 693 * point to the same array.
668 */ 694 */
669 struct cftype *dfl_cftypes; /* for the default hierarchy */ 695 struct cftype *dfl_cftypes; /* for the default hierarchy */
670 struct cftype *legacy_cftypes; /* for the legacy hierarchies */ 696 struct cftype *legacy_cftypes; /* for the legacy hierarchies */
671 697
672 /* 698 /*
673 * A subsystem may depend on other subsystems. When such subsystem 699 * A subsystem may depend on other subsystems. When such subsystem
674 * is enabled on a cgroup, the depended-upon subsystems are enabled 700 * is enabled on a cgroup, the depended-upon subsystems are enabled
675 * together if available. Subsystems enabled due to dependency are 701 * together if available. Subsystems enabled due to dependency are
676 * not visible to userland until explicitly enabled. The following 702 * not visible to userland until explicitly enabled. The following
677 * specifies the mask of subsystems that this one depends on. 703 * specifies the mask of subsystems that this one depends on.
678 */ 704 */
679 unsigned int depends_on; 705 unsigned int depends_on;
680 }; 706 };
681 707
682 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; 708 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
683 #include <linux/cgroup_subsys.h> 709 #include <linux/cgroup_subsys.h>
684 #undef SUBSYS 710 #undef SUBSYS
685 711
686 /** 712 /**
687 * task_css_set_check - obtain a task's css_set with extra access conditions 713 * task_css_set_check - obtain a task's css_set with extra access conditions
688 * @task: the task to obtain css_set for 714 * @task: the task to obtain css_set for
689 * @__c: extra condition expression to be passed to rcu_dereference_check() 715 * @__c: extra condition expression to be passed to rcu_dereference_check()
690 * 716 *
691 * A task's css_set is RCU protected, initialized and exited while holding 717 * A task's css_set is RCU protected, initialized and exited while holding
692 * task_lock(), and can only be modified while holding both cgroup_mutex 718 * task_lock(), and can only be modified while holding both cgroup_mutex
693 * and task_lock() while the task is alive. This macro verifies that the 719 * and task_lock() while the task is alive. This macro verifies that the
694 * caller is inside proper critical section and returns @task's css_set. 720 * caller is inside proper critical section and returns @task's css_set.
695 * 721 *
696 * The caller can also specify additional allowed conditions via @__c, such 722 * The caller can also specify additional allowed conditions via @__c, such
697 * as locks used during the cgroup_subsys::attach() methods. 723 * as locks used during the cgroup_subsys::attach() methods.
698 */ 724 */
699 #ifdef CONFIG_PROVE_RCU 725 #ifdef CONFIG_PROVE_RCU
700 extern struct mutex cgroup_mutex; 726 extern struct mutex cgroup_mutex;
701 extern struct rw_semaphore css_set_rwsem; 727 extern struct rw_semaphore css_set_rwsem;
702 #define task_css_set_check(task, __c) \ 728 #define task_css_set_check(task, __c) \
703 rcu_dereference_check((task)->cgroups, \ 729 rcu_dereference_check((task)->cgroups, \
704 lockdep_is_held(&cgroup_mutex) || \ 730 lockdep_is_held(&cgroup_mutex) || \
705 lockdep_is_held(&css_set_rwsem) || \ 731 lockdep_is_held(&css_set_rwsem) || \
706 ((task)->flags & PF_EXITING) || (__c)) 732 ((task)->flags & PF_EXITING) || (__c))
707 #else 733 #else
708 #define task_css_set_check(task, __c) \ 734 #define task_css_set_check(task, __c) \
709 rcu_dereference((task)->cgroups) 735 rcu_dereference((task)->cgroups)
710 #endif 736 #endif
711 737
712 /** 738 /**
713 * task_css_check - obtain css for (task, subsys) w/ extra access conds 739 * task_css_check - obtain css for (task, subsys) w/ extra access conds
714 * @task: the target task 740 * @task: the target task
715 * @subsys_id: the target subsystem ID 741 * @subsys_id: the target subsystem ID
716 * @__c: extra condition expression to be passed to rcu_dereference_check() 742 * @__c: extra condition expression to be passed to rcu_dereference_check()
717 * 743 *
718 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The 744 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
719 * synchronization rules are the same as task_css_set_check(). 745 * synchronization rules are the same as task_css_set_check().
720 */ 746 */
721 #define task_css_check(task, subsys_id, __c) \ 747 #define task_css_check(task, subsys_id, __c) \
722 task_css_set_check((task), (__c))->subsys[(subsys_id)] 748 task_css_set_check((task), (__c))->subsys[(subsys_id)]
723 749
724 /** 750 /**
725 * task_css_set - obtain a task's css_set 751 * task_css_set - obtain a task's css_set
726 * @task: the task to obtain css_set for 752 * @task: the task to obtain css_set for
727 * 753 *
728 * See task_css_set_check(). 754 * See task_css_set_check().
729 */ 755 */
730 static inline struct css_set *task_css_set(struct task_struct *task) 756 static inline struct css_set *task_css_set(struct task_struct *task)
731 { 757 {
732 return task_css_set_check(task, false); 758 return task_css_set_check(task, false);
733 } 759 }
734 760
735 /** 761 /**
736 * task_css - obtain css for (task, subsys) 762 * task_css - obtain css for (task, subsys)
737 * @task: the target task 763 * @task: the target task
738 * @subsys_id: the target subsystem ID 764 * @subsys_id: the target subsystem ID
739 * 765 *
740 * See task_css_check(). 766 * See task_css_check().
741 */ 767 */
742 static inline struct cgroup_subsys_state *task_css(struct task_struct *task, 768 static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
743 int subsys_id) 769 int subsys_id)
744 { 770 {
745 return task_css_check(task, subsys_id, false); 771 return task_css_check(task, subsys_id, false);
746 } 772 }
747 773
748 /** 774 /**
749 * task_css_is_root - test whether a task belongs to the root css 775 * task_css_is_root - test whether a task belongs to the root css
750 * @task: the target task 776 * @task: the target task
751 * @subsys_id: the target subsystem ID 777 * @subsys_id: the target subsystem ID
752 * 778 *
753 * Test whether @task belongs to the root css on the specified subsystem. 779 * Test whether @task belongs to the root css on the specified subsystem.
754 * May be invoked in any context. 780 * May be invoked in any context.
755 */ 781 */
756 static inline bool task_css_is_root(struct task_struct *task, int subsys_id) 782 static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
757 { 783 {
758 return task_css_check(task, subsys_id, true) == 784 return task_css_check(task, subsys_id, true) ==
759 init_css_set.subsys[subsys_id]; 785 init_css_set.subsys[subsys_id];
760 } 786 }
761 787
762 static inline struct cgroup *task_cgroup(struct task_struct *task, 788 static inline struct cgroup *task_cgroup(struct task_struct *task,
763 int subsys_id) 789 int subsys_id)
764 { 790 {
765 return task_css(task, subsys_id)->cgroup; 791 return task_css(task, subsys_id)->cgroup;
766 } 792 }
767 793
768 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, 794 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
769 struct cgroup_subsys_state *parent); 795 struct cgroup_subsys_state *parent);
770 796
771 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); 797 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
772 798
773 /** 799 /**
774 * css_for_each_child - iterate through children of a css 800 * css_for_each_child - iterate through children of a css
775 * @pos: the css * to use as the loop cursor 801 * @pos: the css * to use as the loop cursor
776 * @parent: css whose children to walk 802 * @parent: css whose children to walk
777 * 803 *
778 * Walk @parent's children. Must be called under rcu_read_lock(). 804 * Walk @parent's children. Must be called under rcu_read_lock().
779 * 805 *
780 * If a subsystem synchronizes ->css_online() and the start of iteration, a 806 * If a subsystem synchronizes ->css_online() and the start of iteration, a
781 * css which finished ->css_online() is guaranteed to be visible in the 807 * css which finished ->css_online() is guaranteed to be visible in the
782 * future iterations and will stay visible until the last reference is put. 808 * future iterations and will stay visible until the last reference is put.
783 * A css which hasn't finished ->css_online() or already finished 809 * A css which hasn't finished ->css_online() or already finished
784 * ->css_offline() may show up during traversal. It's each subsystem's 810 * ->css_offline() may show up during traversal. It's each subsystem's
785 * responsibility to synchronize against on/offlining. 811 * responsibility to synchronize against on/offlining.
786 * 812 *
787 * It is allowed to temporarily drop RCU read lock during iteration. The 813 * It is allowed to temporarily drop RCU read lock during iteration. The
788 * caller is responsible for ensuring that @pos remains accessible until 814 * caller is responsible for ensuring that @pos remains accessible until
789 * the start of the next iteration by, for example, bumping the css refcnt. 815 * the start of the next iteration by, for example, bumping the css refcnt.
790 */ 816 */
791 #define css_for_each_child(pos, parent) \ 817 #define css_for_each_child(pos, parent) \
792 for ((pos) = css_next_child(NULL, (parent)); (pos); \ 818 for ((pos) = css_next_child(NULL, (parent)); (pos); \
793 (pos) = css_next_child((pos), (parent))) 819 (pos) = css_next_child((pos), (parent)))
794 820
795 struct cgroup_subsys_state * 821 struct cgroup_subsys_state *
796 css_next_descendant_pre(struct cgroup_subsys_state *pos, 822 css_next_descendant_pre(struct cgroup_subsys_state *pos,
797 struct cgroup_subsys_state *css); 823 struct cgroup_subsys_state *css);
798 824
799 struct cgroup_subsys_state * 825 struct cgroup_subsys_state *
800 css_rightmost_descendant(struct cgroup_subsys_state *pos); 826 css_rightmost_descendant(struct cgroup_subsys_state *pos);
801 827
802 /** 828 /**
803 * css_for_each_descendant_pre - pre-order walk of a css's descendants 829 * css_for_each_descendant_pre - pre-order walk of a css's descendants
804 * @pos: the css * to use as the loop cursor 830 * @pos: the css * to use as the loop cursor
805 * @root: css whose descendants to walk 831 * @root: css whose descendants to walk
806 * 832 *
807 * Walk @root's descendants. @root is included in the iteration and the 833 * Walk @root's descendants. @root is included in the iteration and the
808 * first node to be visited. Must be called under rcu_read_lock(). 834 * first node to be visited. Must be called under rcu_read_lock().
809 * 835 *
810 * If a subsystem synchronizes ->css_online() and the start of iteration, a 836 * If a subsystem synchronizes ->css_online() and the start of iteration, a
811 * css which finished ->css_online() is guaranteed to be visible in the 837 * css which finished ->css_online() is guaranteed to be visible in the
812 * future iterations and will stay visible until the last reference is put. 838 * future iterations and will stay visible until the last reference is put.
813 * A css which hasn't finished ->css_online() or already finished 839 * A css which hasn't finished ->css_online() or already finished
814 * ->css_offline() may show up during traversal. It's each subsystem's 840 * ->css_offline() may show up during traversal. It's each subsystem's
815 * responsibility to synchronize against on/offlining. 841 * responsibility to synchronize against on/offlining.
816 * 842 *
817 * For example, the following guarantees that a descendant can't escape 843 * For example, the following guarantees that a descendant can't escape
818 * state updates of its ancestors. 844 * state updates of its ancestors.
819 * 845 *
820 * my_online(@css) 846 * my_online(@css)
821 * { 847 * {
822 * Lock @css's parent and @css; 848 * Lock @css's parent and @css;
823 * Inherit state from the parent; 849 * Inherit state from the parent;
824 * Unlock both. 850 * Unlock both.
825 * } 851 * }
826 * 852 *
827 * my_update_state(@css) 853 * my_update_state(@css)
828 * { 854 * {
829 * css_for_each_descendant_pre(@pos, @css) { 855 * css_for_each_descendant_pre(@pos, @css) {
830 * Lock @pos; 856 * Lock @pos;
831 * if (@pos == @css) 857 * if (@pos == @css)
832 * Update @css's state; 858 * Update @css's state;
833 * else 859 * else
834 * Verify @pos is alive and inherit state from its parent; 860 * Verify @pos is alive and inherit state from its parent;
835 * Unlock @pos; 861 * Unlock @pos;
836 * } 862 * }
837 * } 863 * }
838 * 864 *
839 * As long as the inheriting step, including checking the parent state, is 865 * As long as the inheriting step, including checking the parent state, is
840 * enclosed inside @pos locking, double-locking the parent isn't necessary 866 * enclosed inside @pos locking, double-locking the parent isn't necessary
841 * while inheriting. The state update to the parent is guaranteed to be 867 * while inheriting. The state update to the parent is guaranteed to be
842 * visible by walking order and, as long as inheriting operations to the 868 * visible by walking order and, as long as inheriting operations to the
843 * same @pos are atomic to each other, multiple updates racing each other 869 * same @pos are atomic to each other, multiple updates racing each other
844 * still result in the correct state. It's guaranteed that at least one 870 * still result in the correct state. It's guaranteed that at least one
845 * inheritance happens for any css after the latest update to its parent. 871 * inheritance happens for any css after the latest update to its parent.
846 * 872 *
847 * If checking parent's state requires locking the parent, each inheriting 873 * If checking parent's state requires locking the parent, each inheriting
848 * iteration should lock and unlock both @pos->parent and @pos. 874 * iteration should lock and unlock both @pos->parent and @pos.
849 * 875 *
850 * Alternatively, a subsystem may choose to use a single global lock to 876 * Alternatively, a subsystem may choose to use a single global lock to
851 * synchronize ->css_online() and ->css_offline() against tree-walking 877 * synchronize ->css_online() and ->css_offline() against tree-walking
852 * operations. 878 * operations.
853 * 879 *
854 * It is allowed to temporarily drop RCU read lock during iteration. The 880 * It is allowed to temporarily drop RCU read lock during iteration. The
855 * caller is responsible for ensuring that @pos remains accessible until 881 * caller is responsible for ensuring that @pos remains accessible until
856 * the start of the next iteration by, for example, bumping the css refcnt. 882 * the start of the next iteration by, for example, bumping the css refcnt.
857 */ 883 */
858 #define css_for_each_descendant_pre(pos, css) \ 884 #define css_for_each_descendant_pre(pos, css) \
859 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ 885 for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \
860 (pos) = css_next_descendant_pre((pos), (css))) 886 (pos) = css_next_descendant_pre((pos), (css)))
861 887
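Editor's note: a concrete form of the inheritance pattern described in the comment above, as a hedged illustration only. struct my_css_state, my_lock, my_state() and the value field are hypothetical and not part of the cgroup API; only css_for_each_descendant_pre() and css->parent come from the header itself.

#include <linux/cgroup.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>

/* Hypothetical subsystem-private state wrapped around a css. */
struct my_css_state {
        struct cgroup_subsys_state css;
        bool online;    /* set in ->css_online(), cleared in ->css_offline() */
        int value;      /* state inherited from the parent */
};

static DEFINE_SPINLOCK(my_lock);        /* single lock shared by all csses */

static inline struct my_css_state *my_state(struct cgroup_subsys_state *css)
{
        return container_of(css, struct my_css_state, css);
}

/* Push @value down the subtree; pre-order guarantees a parent is updated
 * before any of its descendants are visited. */
static void my_update_state(struct cgroup_subsys_state *root, int value)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_pre(pos, root) {
                spin_lock(&my_lock);
                if (pos == root)
                        my_state(pos)->value = value;
                else if (my_state(pos)->online)
                        my_state(pos)->value = my_state(pos->parent)->value;
                spin_unlock(&my_lock);
        }
        rcu_read_unlock();
}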
862 struct cgroup_subsys_state * 888 struct cgroup_subsys_state *
863 css_next_descendant_post(struct cgroup_subsys_state *pos, 889 css_next_descendant_post(struct cgroup_subsys_state *pos,
864 struct cgroup_subsys_state *css); 890 struct cgroup_subsys_state *css);
865 891
866 /** 892 /**
867 * css_for_each_descendant_post - post-order walk of a css's descendants 893 * css_for_each_descendant_post - post-order walk of a css's descendants
868 * @pos: the css * to use as the loop cursor 894 * @pos: the css * to use as the loop cursor
869 * @css: css whose descendants to walk 895 * @css: css whose descendants to walk
870 * 896 *
871 * Similar to css_for_each_descendant_pre() but performs post-order 897 * Similar to css_for_each_descendant_pre() but performs post-order
872 * traversal instead. @css is included in the iteration and the last 898 * traversal instead. @css is included in the iteration and the last
873 * node to be visited. 899 * node to be visited.
874 * 900 *
875 * If a subsystem synchronizes ->css_online() and the start of iteration, a 901 * If a subsystem synchronizes ->css_online() and the start of iteration, a
876 * css which finished ->css_online() is guaranteed to be visible in the 902 * css which finished ->css_online() is guaranteed to be visible in the
877 * future iterations and will stay visible until the last reference is put. 903 * future iterations and will stay visible until the last reference is put.
878 * A css which hasn't finished ->css_online() or already finished 904 * A css which hasn't finished ->css_online() or already finished
879 * ->css_offline() may show up during traversal. It's each subsystem's 905 * ->css_offline() may show up during traversal. It's each subsystem's
880 * responsibility to synchronize against on/offlining. 906 * responsibility to synchronize against on/offlining.
881 * 907 *
882 * Note that the walk visibility guarantee example described in pre-order 908 * Note that the walk visibility guarantee example described in pre-order
883 * walk doesn't apply the same to post-order walks. 909 * walk doesn't apply the same to post-order walks.
884 */ 910 */
885 #define css_for_each_descendant_post(pos, css) \ 911 #define css_for_each_descendant_post(pos, css) \
886 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ 912 for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \
887 (pos) = css_next_descendant_post((pos), (css))) 913 (pos) = css_next_descendant_post((pos), (css)))
888 914
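Editor's note: for contrast, a post-order walk visits every child before its parent, which suits bottom-up work such as folding per-css counters toward the root of the walked subtree. A hedged sketch, reusing the hypothetical my_state()/my_lock helpers from the pre-order example and assuming my_css_state also carries an unsigned long usage counter.

/* Post-order visits children before their parent, so each parent sees
 * fully drained children by the time it is reached. */
static void my_drain_subtree(struct cgroup_subsys_state *root)
{
        struct cgroup_subsys_state *pos;

        rcu_read_lock();
        css_for_each_descendant_post(pos, root) {
                if (pos == root)
                        continue;       /* keep the walk's root as the sink */
                spin_lock(&my_lock);
                my_state(pos->parent)->usage += my_state(pos)->usage;
                my_state(pos)->usage = 0;
                spin_unlock(&my_lock);
        }
        rcu_read_unlock();
}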
889 bool css_has_online_children(struct cgroup_subsys_state *css); 915 bool css_has_online_children(struct cgroup_subsys_state *css);
890 916
891 /* A css_task_iter should be treated as an opaque object */ 917 /* A css_task_iter should be treated as an opaque object */
892 struct css_task_iter { 918 struct css_task_iter {
893 struct cgroup_subsys *ss; 919 struct cgroup_subsys *ss;
894 920
895 struct list_head *cset_pos; 921 struct list_head *cset_pos;
896 struct list_head *cset_head; 922 struct list_head *cset_head;
897 923
898 struct list_head *task_pos; 924 struct list_head *task_pos;
899 struct list_head *tasks_head; 925 struct list_head *tasks_head;
900 struct list_head *mg_tasks_head; 926 struct list_head *mg_tasks_head;
901 }; 927 };
902 928
903 void css_task_iter_start(struct cgroup_subsys_state *css, 929 void css_task_iter_start(struct cgroup_subsys_state *css,
904 struct css_task_iter *it); 930 struct css_task_iter *it);
905 struct task_struct *css_task_iter_next(struct css_task_iter *it); 931 struct task_struct *css_task_iter_next(struct css_task_iter *it);
906 void css_task_iter_end(struct css_task_iter *it); 932 void css_task_iter_end(struct css_task_iter *it);
907 933
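Editor's note: the iterator above is driven with a simple start / next-until-NULL / end pattern. A minimal sketch; the pr_info() body is a stand-in for real per-task work.

#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/printk.h>

/* Visit every task attached to @css. css_task_iter hides the underlying
 * css_set bookkeeping; the caller only loops until the iterator returns NULL. */
static void my_walk_tasks(struct cgroup_subsys_state *css)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(css, &it);
        while ((task = css_task_iter_next(&it)))
                pr_info("pid %d belongs to this css\n", task_pid_nr(task));
        css_task_iter_end(&it);
}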
908 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); 934 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
909 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); 935 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
910 936
911 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, 937 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
912 struct cgroup_subsys *ss); 938 struct cgroup_subsys *ss);
913 939
914 #else /* !CONFIG_CGROUPS */ 940 #else /* !CONFIG_CGROUPS */
915 941
916 static inline int cgroup_init_early(void) { return 0; } 942 static inline int cgroup_init_early(void) { return 0; }
917 static inline int cgroup_init(void) { return 0; } 943 static inline int cgroup_init(void) { return 0; }
918 static inline void cgroup_fork(struct task_struct *p) {} 944 static inline void cgroup_fork(struct task_struct *p) {}
919 static inline void cgroup_post_fork(struct task_struct *p) {} 945 static inline void cgroup_post_fork(struct task_struct *p) {}
920 static inline void cgroup_exit(struct task_struct *p) {} 946 static inline void cgroup_exit(struct task_struct *p) {}
921 947
922 static inline int cgroupstats_build(struct cgroupstats *stats, 948 static inline int cgroupstats_build(struct cgroupstats *stats,
923 struct dentry *dentry) 949 struct dentry *dentry)
924 { 950 {
925 return -EINVAL; 951 return -EINVAL;
926 } 952 }
927 953
928 /* No cgroups - nothing to do */ 954 /* No cgroups - nothing to do */
929 static inline int cgroup_attach_task_all(struct task_struct *from, 955 static inline int cgroup_attach_task_all(struct task_struct *from,
930 struct task_struct *t) 956 struct task_struct *t)
931 { 957 {
932 return 0; 958 return 0;
933 } 959 }
934 960
935 #endif /* !CONFIG_CGROUPS */ 961 #endif /* !CONFIG_CGROUPS */
936 962
937 #endif /* _LINUX_CGROUP_H */ 963 #endif /* _LINUX_CGROUP_H */
938 964
include/linux/percpu-refcount.h
1 /* 1 /*
2 * Percpu refcounts: 2 * Percpu refcounts:
3 * (C) 2012 Google, Inc. 3 * (C) 2012 Google, Inc.
4 * Author: Kent Overstreet <koverstreet@google.com> 4 * Author: Kent Overstreet <koverstreet@google.com>
5 * 5 *
6 * This implements a refcount with similar semantics to atomic_t - atomic_inc(), 6 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
7 * atomic_dec_and_test() - but percpu. 7 * atomic_dec_and_test() - but percpu.
8 * 8 *
9 * There's one important difference between percpu refs and normal atomic_t 9 * There's one important difference between percpu refs and normal atomic_t
10 * refcounts; you have to keep track of your initial refcount, and then when you 10 * refcounts; you have to keep track of your initial refcount, and then when you
11 * start shutting down you call percpu_ref_kill() _before_ dropping the initial 11 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
12 * refcount. 12 * refcount.
13 * 13 *
14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less 14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
15 * than an atomic_t - this is because of the way shutdown works, see 15 * than an atomic_t - this is because of the way shutdown works, see
16 * percpu_ref_kill()/PERCPU_COUNT_BIAS. 16 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
17 * 17 *
18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the 18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() 19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
20 * puts the ref back in single atomic_t mode, collecting the per cpu refs and 20 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
21 * issuing the appropriate barriers, and then marks the ref as shutting down so 21 * issuing the appropriate barriers, and then marks the ref as shutting down so
22 * that percpu_ref_put() will check for the ref hitting 0. After it returns, 22 * that percpu_ref_put() will check for the ref hitting 0. After it returns,
23 * it's safe to drop the initial ref. 23 * it's safe to drop the initial ref.
24 * 24 *
25 * USAGE: 25 * USAGE:
26 * 26 *
27 * See fs/aio.c for some example usage; it's used there for struct kioctx, which 27 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
28 * is created when userspace calls io_setup(), and destroyed when userspace 28 * is created when userspace calls io_setup(), and destroyed when userspace
29 * calls io_destroy() or the process exits. 29 * calls io_destroy() or the process exits.
30 * 30 *
31 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it 31 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
32 * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove 32 * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
33 * the kioctx from the process's list of kioctxs - after that, there can't be 33 * the kioctx from the process's list of kioctxs - after that, there can't be
34 * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop 34 * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
35 * the initial ref with percpu_ref_put(). 35 * the initial ref with percpu_ref_put().
36 * 36 *
37 * Code that does a two stage shutdown like this often needs some kind of 37 * Code that does a two stage shutdown like this often needs some kind of
38 * explicit synchronization to ensure the initial refcount can only be dropped 38 * explicit synchronization to ensure the initial refcount can only be dropped
39 * once - percpu_ref_kill() does this for you, it returns true once and false if 39 * once - percpu_ref_kill() does this for you, it returns true once and false if
40 * someone else already called it. The aio code uses it this way, but it's not 40 * someone else already called it. The aio code uses it this way, but it's not
41 * necessary if the code has some other mechanism to synchronize teardown. 41 * necessary if the code has some other mechanism to synchronize teardown.
42 * 42 *
43 */ 43 */
44 44
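Editor's note: a condensed illustration of the init/get/put/kill lifecycle described in the comment above. struct my_obj, its release callback and the helpers are hypothetical, and the synchronization that stops new lookups is elided; only the percpu_ref_* calls come from this header.

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_obj {
        struct percpu_ref ref;
        /* ... payload ... */
};

/* Called once the last reference is dropped. */
static void my_obj_release(struct percpu_ref *ref)
{
        kfree(container_of(ref, struct my_obj, ref));
}

static struct my_obj *my_obj_create(void)
{
        struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

        if (!obj)
                return NULL;
        if (percpu_ref_init(&obj->ref, my_obj_release, 0, GFP_KERNEL)) {
                kfree(obj);
                return NULL;
        }
        return obj;             /* creation holds the initial reference */
}

/* Short-term users bracket their access with get/put. */
static void my_obj_use(struct my_obj *obj)
{
        percpu_ref_get(&obj->ref);
        /* ... operate on obj ... */
        percpu_ref_put(&obj->ref);
}

static void my_obj_destroy(struct my_obj *obj)
{
        /*
         * After unpublishing the object so no new lookups can find it,
         * drop the initial reference exactly once; my_obj_release() runs
         * when all outstanding references are gone.
         */
        percpu_ref_kill(&obj->ref);
}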
45 #ifndef _LINUX_PERCPU_REFCOUNT_H 45 #ifndef _LINUX_PERCPU_REFCOUNT_H
46 #define _LINUX_PERCPU_REFCOUNT_H 46 #define _LINUX_PERCPU_REFCOUNT_H
47 47
48 #include <linux/atomic.h> 48 #include <linux/atomic.h>
49 #include <linux/kernel.h> 49 #include <linux/kernel.h>
50 #include <linux/percpu.h> 50 #include <linux/percpu.h>
51 #include <linux/rcupdate.h> 51 #include <linux/rcupdate.h>
52 #include <linux/gfp.h> 52 #include <linux/gfp.h>
53 53
54 struct percpu_ref; 54 struct percpu_ref;
55 typedef void (percpu_ref_func_t)(struct percpu_ref *); 55 typedef void (percpu_ref_func_t)(struct percpu_ref *);
56 56
57 /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ 57 /* flags set in the lower bits of percpu_ref->percpu_count_ptr */
58 enum { 58 enum {
59 __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ 59 __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
60 __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ 60 __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
61 __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, 61 __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
62 62
63 __PERCPU_REF_FLAG_BITS = 2, 63 __PERCPU_REF_FLAG_BITS = 2,
64 }; 64 };
65 65
66 /* @flags for percpu_ref_init() */ 66 /* @flags for percpu_ref_init() */
67 enum { 67 enum {
68 /* 68 /*
69 * Start w/ ref == 1 in atomic mode. Can be switched to percpu 69 * Start w/ ref == 1 in atomic mode. Can be switched to percpu
70 * operation using percpu_ref_switch_to_percpu(). If initialized 70 * operation using percpu_ref_switch_to_percpu(). If initialized
71 * with this flag, the ref will stay in atomic mode until 71 * with this flag, the ref will stay in atomic mode until
72 * percpu_ref_switch_to_percpu() is invoked on it. 72 * percpu_ref_switch_to_percpu() is invoked on it.
73 */ 73 */
74 PERCPU_REF_INIT_ATOMIC = 1 << 0, 74 PERCPU_REF_INIT_ATOMIC = 1 << 0,
75 75
76 /* 76 /*
77 * Start dead w/ ref == 0 in atomic mode. Must be revived with 77 * Start dead w/ ref == 0 in atomic mode. Must be revived with
78 * percpu_ref_reinit() before used. Implies INIT_ATOMIC. 78 * percpu_ref_reinit() before used. Implies INIT_ATOMIC.
79 */ 79 */
80 PERCPU_REF_INIT_DEAD = 1 << 1, 80 PERCPU_REF_INIT_DEAD = 1 << 1,
81 }; 81 };
82 82
83 struct percpu_ref { 83 struct percpu_ref {
84 atomic_long_t count; 84 atomic_long_t count;
85 /* 85 /*
86 * The low bit of the pointer indicates whether the ref is in percpu 86 * The low bit of the pointer indicates whether the ref is in percpu
87 * mode; if set, then get/put will manipulate the atomic_t. 87 * mode; if set, then get/put will manipulate the atomic_t.
88 */ 88 */
89 unsigned long percpu_count_ptr; 89 unsigned long percpu_count_ptr;
90 percpu_ref_func_t *release; 90 percpu_ref_func_t *release;
91 percpu_ref_func_t *confirm_switch; 91 percpu_ref_func_t *confirm_switch;
92 bool force_atomic:1; 92 bool force_atomic:1;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 }; 94 };
95 95
96 int __must_check percpu_ref_init(struct percpu_ref *ref, 96 int __must_check percpu_ref_init(struct percpu_ref *ref,
97 percpu_ref_func_t *release, unsigned int flags, 97 percpu_ref_func_t *release, unsigned int flags,
98 gfp_t gfp); 98 gfp_t gfp);
99 void percpu_ref_exit(struct percpu_ref *ref); 99 void percpu_ref_exit(struct percpu_ref *ref);
100 void percpu_ref_switch_to_atomic(struct percpu_ref *ref, 100 void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
101 percpu_ref_func_t *confirm_switch); 101 percpu_ref_func_t *confirm_switch);
102 void percpu_ref_switch_to_percpu(struct percpu_ref *ref); 102 void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 103 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
104 percpu_ref_func_t *confirm_kill); 104 percpu_ref_func_t *confirm_kill);
105 void percpu_ref_reinit(struct percpu_ref *ref); 105 void percpu_ref_reinit(struct percpu_ref *ref);
106 106
107 /** 107 /**
108 * percpu_ref_kill - drop the initial ref 108 * percpu_ref_kill - drop the initial ref
109 * @ref: percpu_ref to kill 109 * @ref: percpu_ref to kill
110 * 110 *
111 * Must be used to drop the initial ref on a percpu refcount; must be called 111 * Must be used to drop the initial ref on a percpu refcount; must be called
112 * precisely once before shutdown. 112 * precisely once before shutdown.
113 * 113 *
114 * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the 114 * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
115 * percpu counters and dropping the initial ref. 115 * percpu counters and dropping the initial ref.
116 */ 116 */
117 static inline void percpu_ref_kill(struct percpu_ref *ref) 117 static inline void percpu_ref_kill(struct percpu_ref *ref)
118 { 118 {
119 return percpu_ref_kill_and_confirm(ref, NULL); 119 return percpu_ref_kill_and_confirm(ref, NULL);
120 } 120 }
121 121
122 /* 122 /*
123 * Internal helper. Don't use outside percpu-refcount proper. The 123 * Internal helper. Don't use outside percpu-refcount proper. The
124 * function doesn't return the pointer and let the caller test it for NULL 124 * function doesn't return the pointer and let the caller test it for NULL
125 * because doing so forces the compiler to generate two conditional 125 * because doing so forces the compiler to generate two conditional
126 * branches as it can't assume that @ref->percpu_count is not NULL. 126 * branches as it can't assume that @ref->percpu_count is not NULL.
127 */ 127 */
128 static inline bool __ref_is_percpu(struct percpu_ref *ref, 128 static inline bool __ref_is_percpu(struct percpu_ref *ref,
129 unsigned long __percpu **percpu_countp) 129 unsigned long __percpu **percpu_countp)
130 { 130 {
131 unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); 131 unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr);
132 132
133 /* paired with smp_store_release() in percpu_ref_reinit() */ 133 /* paired with smp_store_release() in percpu_ref_reinit() */
134 smp_read_barrier_depends(); 134 smp_read_barrier_depends();
135 135
136 /* 136 /*
137 * Theoretically, the following could test just ATOMIC; however, 137 * Theoretically, the following could test just ATOMIC; however,
138 * then we'd have to mask off DEAD separately as DEAD may be 138 * then we'd have to mask off DEAD separately as DEAD may be
139 * visible without ATOMIC if we race with percpu_ref_kill(). DEAD 139 * visible without ATOMIC if we race with percpu_ref_kill(). DEAD
140 * implies ATOMIC anyway. Test them together. 140 * implies ATOMIC anyway. Test them together.
141 */ 141 */
142 if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) 142 if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
143 return false; 143 return false;
144 144
145 *percpu_countp = (unsigned long __percpu *)percpu_ptr; 145 *percpu_countp = (unsigned long __percpu *)percpu_ptr;
146 return true; 146 return true;
147 } 147 }
148 148
149 /** 149 /**
150 * percpu_ref_get - increment a percpu refcount 150 * percpu_ref_get_many - increment a percpu refcount
151 * @ref: percpu_ref to get 151 * @ref: percpu_ref to get
152 * @nr: number of references to get
152 * 153 *
153 * Analagous to atomic_long_inc(). 154 * Analogous to atomic_long_add().
154 * 155 *
155 * This function is safe to call as long as @ref is between init and exit. 156 * This function is safe to call as long as @ref is between init and exit.
156 */ 157 */
157 static inline void percpu_ref_get(struct percpu_ref *ref) 158 static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
158 { 159 {
159 unsigned long __percpu *percpu_count; 160 unsigned long __percpu *percpu_count;
160 161
161 rcu_read_lock_sched(); 162 rcu_read_lock_sched();
162 163
163 if (__ref_is_percpu(ref, &percpu_count)) 164 if (__ref_is_percpu(ref, &percpu_count))
164 this_cpu_inc(*percpu_count); 165 this_cpu_add(*percpu_count, nr);
165 else 166 else
166 atomic_long_inc(&ref->count); 167 atomic_long_add(nr, &ref->count);
167 168
168 rcu_read_unlock_sched(); 169 rcu_read_unlock_sched();
169 } 170 }
170 171
171 /** 172 /**
173 * percpu_ref_get - increment a percpu refcount
174 * @ref: percpu_ref to get
175 *
176 * Analogous to atomic_long_inc().
177 *
178 * This function is safe to call as long as @ref is between init and exit.
179 */
180 static inline void percpu_ref_get(struct percpu_ref *ref)
181 {
182 percpu_ref_get_many(ref, 1);
183 }
184
185 /**
172 * percpu_ref_tryget - try to increment a percpu refcount 186 * percpu_ref_tryget - try to increment a percpu refcount
173 * @ref: percpu_ref to try-get 187 * @ref: percpu_ref to try-get
174 * 188 *
175 * Increment a percpu refcount unless its count already reached zero. 189 * Increment a percpu refcount unless its count already reached zero.
176 * Returns %true on success; %false on failure. 190 * Returns %true on success; %false on failure.
177 * 191 *
178 * This function is safe to call as long as @ref is between init and exit. 192 * This function is safe to call as long as @ref is between init and exit.
179 */ 193 */
180 static inline bool percpu_ref_tryget(struct percpu_ref *ref) 194 static inline bool percpu_ref_tryget(struct percpu_ref *ref)
181 { 195 {
182 unsigned long __percpu *percpu_count; 196 unsigned long __percpu *percpu_count;
183 int ret; 197 int ret;
184 198
185 rcu_read_lock_sched(); 199 rcu_read_lock_sched();
186 200
187 if (__ref_is_percpu(ref, &percpu_count)) { 201 if (__ref_is_percpu(ref, &percpu_count)) {
188 this_cpu_inc(*percpu_count); 202 this_cpu_inc(*percpu_count);
189 ret = true; 203 ret = true;
190 } else { 204 } else {
191 ret = atomic_long_inc_not_zero(&ref->count); 205 ret = atomic_long_inc_not_zero(&ref->count);
192 } 206 }
193 207
194 rcu_read_unlock_sched(); 208 rcu_read_unlock_sched();
195 209
196 return ret; 210 return ret;
197 } 211 }
198 212
199 /** 213 /**
200 * percpu_ref_tryget_live - try to increment a live percpu refcount 214 * percpu_ref_tryget_live - try to increment a live percpu refcount
201 * @ref: percpu_ref to try-get 215 * @ref: percpu_ref to try-get
202 * 216 *
203 * Increment a percpu refcount unless it has already been killed. Returns 217 * Increment a percpu refcount unless it has already been killed. Returns
204 * %true on success; %false on failure. 218 * %true on success; %false on failure.
205 * 219 *
206 * Completion of percpu_ref_kill() in itself doesn't guarantee that this 220 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
207 * function will fail. For such guarantee, percpu_ref_kill_and_confirm() 221 * function will fail. For such guarantee, percpu_ref_kill_and_confirm()
208 * should be used. After the confirm_kill callback is invoked, it's 222 * should be used. After the confirm_kill callback is invoked, it's
209 * guaranteed that no new reference will be given out by 223 * guaranteed that no new reference will be given out by
210 * percpu_ref_tryget_live(). 224 * percpu_ref_tryget_live().
211 * 225 *
212 * This function is safe to call as long as @ref is between init and exit. 226 * This function is safe to call as long as @ref is between init and exit.
213 */ 227 */
214 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) 228 static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
215 { 229 {
216 unsigned long __percpu *percpu_count; 230 unsigned long __percpu *percpu_count;
217 int ret = false; 231 int ret = false;
218 232
219 rcu_read_lock_sched(); 233 rcu_read_lock_sched();
220 234
221 if (__ref_is_percpu(ref, &percpu_count)) { 235 if (__ref_is_percpu(ref, &percpu_count)) {
222 this_cpu_inc(*percpu_count); 236 this_cpu_inc(*percpu_count);
223 ret = true; 237 ret = true;
224 } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { 238 } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) {
225 ret = atomic_long_inc_not_zero(&ref->count); 239 ret = atomic_long_inc_not_zero(&ref->count);
226 } 240 }
227 241
228 rcu_read_unlock_sched(); 242 rcu_read_unlock_sched();
229 243
230 return ret; 244 return ret;
231 } 245 }
232 246
233 /** 247 /**
234 * percpu_ref_put - decrement a percpu refcount 248 * percpu_ref_put_many - decrement a percpu refcount
235 * @ref: percpu_ref to put 249 * @ref: percpu_ref to put
250 * @nr: number of references to put
236 * 251 *
237 * Decrement the refcount, and if 0, call the release function (which was passed 252 * Decrement the refcount, and if 0, call the release function (which was passed
238 * to percpu_ref_init()) 253 * to percpu_ref_init())
239 * 254 *
240 * This function is safe to call as long as @ref is between init and exit. 255 * This function is safe to call as long as @ref is between init and exit.
241 */ 256 */
242 static inline void percpu_ref_put(struct percpu_ref *ref) 257 static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
243 { 258 {
244 unsigned long __percpu *percpu_count; 259 unsigned long __percpu *percpu_count;
245 260
246 rcu_read_lock_sched(); 261 rcu_read_lock_sched();
247 262
248 if (__ref_is_percpu(ref, &percpu_count)) 263 if (__ref_is_percpu(ref, &percpu_count))
249 this_cpu_dec(*percpu_count); 264 this_cpu_sub(*percpu_count, nr);
250 else if (unlikely(atomic_long_dec_and_test(&ref->count))) 265 else if (unlikely(atomic_long_sub_and_test(nr, &ref->count)))
251 ref->release(ref); 266 ref->release(ref);
252 267
253 rcu_read_unlock_sched(); 268 rcu_read_unlock_sched();
269 }
270
271 /**
272 * percpu_ref_put - decrement a percpu refcount
273 * @ref: percpu_ref to put
274 *
275 * Decrement the refcount, and if 0, call the release function (which was passed
276 * to percpu_ref_init())
277 *
278 * This function is safe to call as long as @ref is between init and exit.
279 */
280 static inline void percpu_ref_put(struct percpu_ref *ref)
281 {
282 percpu_ref_put_many(ref, 1);
254 } 283 }
255 284
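Editor's note: the new *_many variants are what allows a user such as memcg to take one reference per charged page in a single percpu operation instead of a loop. A hedged sketch of a batched pin/unpin; my_charge(), my_uncharge() and the -ENODEV error are illustrative.

#include <linux/percpu-refcount.h>
#include <linux/errno.h>

/* Pin @ref once per page in a charge batch, refusing already-killed refs. */
static int my_charge(struct percpu_ref *ref, unsigned int nr_pages)
{
        if (!percpu_ref_tryget_live(ref))
                return -ENODEV;         /* already killed, refuse new charges */
        if (nr_pages > 1)
                percpu_ref_get_many(ref, nr_pages - 1); /* tryget took the first */
        return 0;
}

/* Drop all references taken for the batch in one operation. */
static void my_uncharge(struct percpu_ref *ref, unsigned int nr_pages)
{
        percpu_ref_put_many(ref, nr_pages);
}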
256 /** 285 /**
257 * percpu_ref_is_zero - test whether a percpu refcount reached zero 286 * percpu_ref_is_zero - test whether a percpu refcount reached zero
258 * @ref: percpu_ref to test 287 * @ref: percpu_ref to test
259 * 288 *
260 * Returns %true if @ref reached zero. 289 * Returns %true if @ref reached zero.
261 * 290 *
262 * This function is safe to call as long as @ref is between init and exit. 291 * This function is safe to call as long as @ref is between init and exit.
263 */ 292 */
264 static inline bool percpu_ref_is_zero(struct percpu_ref *ref) 293 static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
265 { 294 {
266 unsigned long __percpu *percpu_count; 295 unsigned long __percpu *percpu_count;
267 296
268 if (__ref_is_percpu(ref, &percpu_count)) 297 if (__ref_is_percpu(ref, &percpu_count))
269 return false; 298 return false;
270 return !atomic_long_read(&ref->count); 299 return !atomic_long_read(&ref->count);
271 } 300 }
272 301
273 #endif 302 #endif
274 303
mm/memcontrol.c
1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller 13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version. 20 * (at your option) any later version.
21 * 21 *
22 * This program is distributed in the hope that it will be useful, 22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28 #include <linux/page_counter.h> 28 #include <linux/page_counter.h>
29 #include <linux/memcontrol.h> 29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h> 30 #include <linux/cgroup.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/hugetlb.h> 32 #include <linux/hugetlb.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/smp.h> 34 #include <linux/smp.h>
35 #include <linux/page-flags.h> 35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h> 36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h> 37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h> 38 #include <linux/rcupdate.h>
39 #include <linux/limits.h> 39 #include <linux/limits.h>
40 #include <linux/export.h> 40 #include <linux/export.h>
41 #include <linux/mutex.h> 41 #include <linux/mutex.h>
42 #include <linux/rbtree.h> 42 #include <linux/rbtree.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/swap.h> 44 #include <linux/swap.h>
45 #include <linux/swapops.h> 45 #include <linux/swapops.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h> 47 #include <linux/eventfd.h>
48 #include <linux/poll.h> 48 #include <linux/poll.h>
49 #include <linux/sort.h> 49 #include <linux/sort.h>
50 #include <linux/fs.h> 50 #include <linux/fs.h>
51 #include <linux/seq_file.h> 51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h> 52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h> 53 #include <linux/mm_inline.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/cpu.h> 55 #include <linux/cpu.h>
56 #include <linux/oom.h> 56 #include <linux/oom.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/file.h> 58 #include <linux/file.h>
59 #include "internal.h" 59 #include "internal.h"
60 #include <net/sock.h> 60 #include <net/sock.h>
61 #include <net/ip.h> 61 #include <net/ip.h>
62 #include <net/tcp_memcontrol.h> 62 #include <net/tcp_memcontrol.h>
63 #include "slab.h" 63 #include "slab.h"
64 64
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 66
67 #include <trace/events/vmscan.h> 67 #include <trace/events/vmscan.h>
68 68
69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70 EXPORT_SYMBOL(memory_cgrp_subsys); 70 EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72 #define MEM_CGROUP_RECLAIM_RETRIES 5 72 #define MEM_CGROUP_RECLAIM_RETRIES 5
73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 73 static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75 #ifdef CONFIG_MEMCG_SWAP 75 #ifdef CONFIG_MEMCG_SWAP
76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77 int do_swap_account __read_mostly; 77 int do_swap_account __read_mostly;
78 78
79 /* to remember the boot option */ 79 /* to remember the boot option */
80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED
81 static int really_do_swap_account __initdata = 1; 81 static int really_do_swap_account __initdata = 1;
82 #else 82 #else
83 static int really_do_swap_account __initdata; 83 static int really_do_swap_account __initdata;
84 #endif 84 #endif
85 85
86 #else 86 #else
87 #define do_swap_account 0 87 #define do_swap_account 0
88 #endif 88 #endif
89 89
90 90
91 static const char * const mem_cgroup_stat_names[] = { 91 static const char * const mem_cgroup_stat_names[] = {
92 "cache", 92 "cache",
93 "rss", 93 "rss",
94 "rss_huge", 94 "rss_huge",
95 "mapped_file", 95 "mapped_file",
96 "writeback", 96 "writeback",
97 "swap", 97 "swap",
98 }; 98 };
99 99
100 enum mem_cgroup_events_index { 100 enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS, 105 MEM_CGROUP_EVENTS_NSTATS,
106 }; 106 };
107 107
108 static const char * const mem_cgroup_events_names[] = { 108 static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 109 "pgpgin",
110 "pgpgout", 110 "pgpgout",
111 "pgfault", 111 "pgfault",
112 "pgmajfault", 112 "pgmajfault",
113 }; 113 };
114 114
115 static const char * const mem_cgroup_lru_names[] = { 115 static const char * const mem_cgroup_lru_names[] = {
116 "inactive_anon", 116 "inactive_anon",
117 "active_anon", 117 "active_anon",
118 "inactive_file", 118 "inactive_file",
119 "active_file", 119 "active_file",
120 "unevictable", 120 "unevictable",
121 }; 121 };
122 122
123 /* 123 /*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 124 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremated by the number of pages. This counter is used for 125 * it will be incremated by the number of pages. This counter is used for
126 * for trigger some periodic events. This is straightforward and better 126 * for trigger some periodic events. This is straightforward and better
127 * than using jiffies etc. to handle periodic memcg event. 127 * than using jiffies etc. to handle periodic memcg event.
128 */ 128 */
129 enum mem_cgroup_events_target { 129 enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH, 130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT, 131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO, 132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS, 133 MEM_CGROUP_NTARGETS,
134 }; 134 };
135 #define THRESHOLDS_EVENTS_TARGET 128 135 #define THRESHOLDS_EVENTS_TARGET 128
136 #define SOFTLIMIT_EVENTS_TARGET 1024 136 #define SOFTLIMIT_EVENTS_TARGET 1024
137 #define NUMAINFO_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024
138 138
139 struct mem_cgroup_stat_cpu { 139 struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events; 142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144 }; 144 };
145 145
146 struct reclaim_iter { 146 struct reclaim_iter {
147 struct mem_cgroup *position; 147 struct mem_cgroup *position;
148 /* scan generation, increased every round-trip */ 148 /* scan generation, increased every round-trip */
149 unsigned int generation; 149 unsigned int generation;
150 }; 150 };
151 151
152 /* 152 /*
153 * per-zone information in memory controller. 153 * per-zone information in memory controller.
154 */ 154 */
155 struct mem_cgroup_per_zone { 155 struct mem_cgroup_per_zone {
156 struct lruvec lruvec; 156 struct lruvec lruvec;
157 unsigned long lru_size[NR_LRU_LISTS]; 157 unsigned long lru_size[NR_LRU_LISTS];
158 158
159 struct reclaim_iter iter[DEF_PRIORITY + 1]; 159 struct reclaim_iter iter[DEF_PRIORITY + 1];
160 160
161 struct rb_node tree_node; /* RB tree node */ 161 struct rb_node tree_node; /* RB tree node */
162 unsigned long usage_in_excess;/* Set to the value by which */ 162 unsigned long usage_in_excess;/* Set to the value by which */
163 /* the soft limit is exceeded*/ 163 /* the soft limit is exceeded*/
164 bool on_tree; 164 bool on_tree;
165 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 165 struct mem_cgroup *memcg; /* Back pointer, we cannot */
166 /* use container_of */ 166 /* use container_of */
167 }; 167 };
168 168
169 struct mem_cgroup_per_node { 169 struct mem_cgroup_per_node {
170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 170 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
171 }; 171 };
172 172
173 /* 173 /*
174 * Cgroups above their limits are maintained in a RB-Tree, independent of 174 * Cgroups above their limits are maintained in a RB-Tree, independent of
175 * their hierarchy representation 175 * their hierarchy representation
176 */ 176 */
177 177
178 struct mem_cgroup_tree_per_zone { 178 struct mem_cgroup_tree_per_zone {
179 struct rb_root rb_root; 179 struct rb_root rb_root;
180 spinlock_t lock; 180 spinlock_t lock;
181 }; 181 };
182 182
183 struct mem_cgroup_tree_per_node { 183 struct mem_cgroup_tree_per_node {
184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 184 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
185 }; 185 };
186 186
187 struct mem_cgroup_tree { 187 struct mem_cgroup_tree {
188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 188 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
189 }; 189 };
190 190
191 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 191 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
192 192
193 struct mem_cgroup_threshold { 193 struct mem_cgroup_threshold {
194 struct eventfd_ctx *eventfd; 194 struct eventfd_ctx *eventfd;
195 unsigned long threshold; 195 unsigned long threshold;
196 }; 196 };
197 197
198 /* For threshold */ 198 /* For threshold */
199 struct mem_cgroup_threshold_ary { 199 struct mem_cgroup_threshold_ary {
200 /* An array index points to threshold just below or equal to usage. */ 200 /* An array index points to threshold just below or equal to usage. */
201 int current_threshold; 201 int current_threshold;
202 /* Size of entries[] */ 202 /* Size of entries[] */
203 unsigned int size; 203 unsigned int size;
204 /* Array of thresholds */ 204 /* Array of thresholds */
205 struct mem_cgroup_threshold entries[0]; 205 struct mem_cgroup_threshold entries[0];
206 }; 206 };
207 207
208 struct mem_cgroup_thresholds { 208 struct mem_cgroup_thresholds {
209 /* Primary thresholds array */ 209 /* Primary thresholds array */
210 struct mem_cgroup_threshold_ary *primary; 210 struct mem_cgroup_threshold_ary *primary;
211 /* 211 /*
212 * Spare threshold array. 212 * Spare threshold array.
213 * This is needed to make mem_cgroup_unregister_event() "never fail". 213 * This is needed to make mem_cgroup_unregister_event() "never fail".
214 * It must be able to store at least primary->size - 1 entries. 214 * It must be able to store at least primary->size - 1 entries.
215 */ 215 */
216 struct mem_cgroup_threshold_ary *spare; 216 struct mem_cgroup_threshold_ary *spare;
217 }; 217 };
218 218
219 /* for OOM */ 219 /* for OOM */
220 struct mem_cgroup_eventfd_list { 220 struct mem_cgroup_eventfd_list {
221 struct list_head list; 221 struct list_head list;
222 struct eventfd_ctx *eventfd; 222 struct eventfd_ctx *eventfd;
223 }; 223 };
224 224
225 /* 225 /*
226 * cgroup_event represents events which userspace wants to receive. 226 * cgroup_event represents events which userspace wants to receive.
227 */ 227 */
228 struct mem_cgroup_event { 228 struct mem_cgroup_event {
229 /* 229 /*
230 * memcg which the event belongs to. 230 * memcg which the event belongs to.
231 */ 231 */
232 struct mem_cgroup *memcg; 232 struct mem_cgroup *memcg;
233 /* 233 /*
234 * eventfd to signal userspace about the event. 234 * eventfd to signal userspace about the event.
235 */ 235 */
236 struct eventfd_ctx *eventfd; 236 struct eventfd_ctx *eventfd;
237 /* 237 /*
238 * Each of these is stored in a list by the cgroup. 238 * Each of these is stored in a list by the cgroup.
239 */ 239 */
240 struct list_head list; 240 struct list_head list;
241 /* 241 /*
242 * register_event() callback will be used to add new userspace 242 * register_event() callback will be used to add new userspace
243 * waiter for changes related to this event. Use eventfd_signal() 243 * waiter for changes related to this event. Use eventfd_signal()
244 * on eventfd to send notification to userspace. 244 * on eventfd to send notification to userspace.
245 */ 245 */
246 int (*register_event)(struct mem_cgroup *memcg, 246 int (*register_event)(struct mem_cgroup *memcg,
247 struct eventfd_ctx *eventfd, const char *args); 247 struct eventfd_ctx *eventfd, const char *args);
248 /* 248 /*
249 * unregister_event() callback will be called when userspace closes 249 * unregister_event() callback will be called when userspace closes
250 * the eventfd or when the cgroup is removed. This callback must be set, 250 * the eventfd or when the cgroup is removed. This callback must be set,
251 * if you want to provide notification functionality. 251 * if you want to provide notification functionality.
252 */ 252 */
253 void (*unregister_event)(struct mem_cgroup *memcg, 253 void (*unregister_event)(struct mem_cgroup *memcg,
254 struct eventfd_ctx *eventfd); 254 struct eventfd_ctx *eventfd);
255 /* 255 /*
256 * All fields below needed to unregister event when 256 * All fields below needed to unregister event when
257 * userspace closes eventfd. 257 * userspace closes eventfd.
258 */ 258 */
259 poll_table pt; 259 poll_table pt;
260 wait_queue_head_t *wqh; 260 wait_queue_head_t *wqh;
261 wait_queue_t wait; 261 wait_queue_t wait;
262 struct work_struct remove; 262 struct work_struct remove;
263 }; 263 };
264 264
265 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 265 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 266 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
267 267
268 /* 268 /*
269 * The memory controller data structure. The memory controller controls both 269 * The memory controller data structure. The memory controller controls both
270 * page cache and RSS per cgroup. We would eventually like to provide 270 * page cache and RSS per cgroup. We would eventually like to provide
271 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 271 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
272 * to help the administrator determine what knobs to tune. 272 * to help the administrator determine what knobs to tune.
273 * 273 *
274 * TODO: Add a water mark for the memory controller. Reclaim will begin when 274 * TODO: Add a water mark for the memory controller. Reclaim will begin when
275 * we hit the water mark. May be even add a low water mark, such that 275 * we hit the water mark. May be even add a low water mark, such that
276 * no reclaim occurs from a cgroup at its low water mark, this is 276 * no reclaim occurs from a cgroup at its low water mark, this is
277 * a feature that will be implemented much later in the future. 277 * a feature that will be implemented much later in the future.
278 */ 278 */
279 struct mem_cgroup { 279 struct mem_cgroup {
280 struct cgroup_subsys_state css; 280 struct cgroup_subsys_state css;
281 281
282 /* Accounted resources */ 282 /* Accounted resources */
283 struct page_counter memory; 283 struct page_counter memory;
284 struct page_counter memsw; 284 struct page_counter memsw;
285 struct page_counter kmem; 285 struct page_counter kmem;
286 286
287 unsigned long soft_limit; 287 unsigned long soft_limit;
288 288
289 /* vmpressure notifications */ 289 /* vmpressure notifications */
290 struct vmpressure vmpressure; 290 struct vmpressure vmpressure;
291 291
292 /* css_online() has been completed */ 292 /* css_online() has been completed */
293 int initialized; 293 int initialized;
294 294
295 /* 295 /*
296 * Should the accounting and control be hierarchical, per subtree? 296 * Should the accounting and control be hierarchical, per subtree?
297 */ 297 */
298 bool use_hierarchy; 298 bool use_hierarchy;
299 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 299 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
300 300
301 bool oom_lock; 301 bool oom_lock;
302 atomic_t under_oom; 302 atomic_t under_oom;
303 atomic_t oom_wakeups; 303 atomic_t oom_wakeups;
304 304
305 int swappiness; 305 int swappiness;
306 /* OOM-Killer disable */ 306 /* OOM-Killer disable */
307 int oom_kill_disable; 307 int oom_kill_disable;
308 308
309 /* protect arrays of thresholds */ 309 /* protect arrays of thresholds */
310 struct mutex thresholds_lock; 310 struct mutex thresholds_lock;
311 311
312 /* thresholds for memory usage. RCU-protected */ 312 /* thresholds for memory usage. RCU-protected */
313 struct mem_cgroup_thresholds thresholds; 313 struct mem_cgroup_thresholds thresholds;
314 314
315 /* thresholds for mem+swap usage. RCU-protected */ 315 /* thresholds for mem+swap usage. RCU-protected */
316 struct mem_cgroup_thresholds memsw_thresholds; 316 struct mem_cgroup_thresholds memsw_thresholds;
317 317
318 /* For oom notifier event fd */ 318 /* For oom notifier event fd */
319 struct list_head oom_notify; 319 struct list_head oom_notify;
320 320
321 /* 321 /*
322 * Should we move charges of a task when a task is moved into this 322 * Should we move charges of a task when a task is moved into this
323 * mem_cgroup ? And what type of charges should we move ? 323 * mem_cgroup ? And what type of charges should we move ?
324 */ 324 */
325 unsigned long move_charge_at_immigrate; 325 unsigned long move_charge_at_immigrate;
326 /* 326 /*
327 * set > 0 if pages under this cgroup are moving to other cgroup. 327 * set > 0 if pages under this cgroup are moving to other cgroup.
328 */ 328 */
329 atomic_t moving_account; 329 atomic_t moving_account;
330 /* taken only while moving_account > 0 */ 330 /* taken only while moving_account > 0 */
331 spinlock_t move_lock; 331 spinlock_t move_lock;
332 /* 332 /*
333 * percpu counter. 333 * percpu counter.
334 */ 334 */
335 struct mem_cgroup_stat_cpu __percpu *stat; 335 struct mem_cgroup_stat_cpu __percpu *stat;
336 /* 336 /*
337 * used when a cpu is offlined or other synchronizations 337 * used when a cpu is offlined or other synchronizations
338 * See mem_cgroup_read_stat(). 338 * See mem_cgroup_read_stat().
339 */ 339 */
340 struct mem_cgroup_stat_cpu nocpu_base; 340 struct mem_cgroup_stat_cpu nocpu_base;
341 spinlock_t pcp_counter_lock; 341 spinlock_t pcp_counter_lock;
342 342
343 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 343 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
344 struct cg_proto tcp_mem; 344 struct cg_proto tcp_mem;
345 #endif 345 #endif
346 #if defined(CONFIG_MEMCG_KMEM) 346 #if defined(CONFIG_MEMCG_KMEM)
347 /* analogous to slab_common's slab_caches list, but per-memcg; 347 /* analogous to slab_common's slab_caches list, but per-memcg;
348 * protected by memcg_slab_mutex */ 348 * protected by memcg_slab_mutex */
349 struct list_head memcg_slab_caches; 349 struct list_head memcg_slab_caches;
350 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 350 /* Index in the kmem_cache->memcg_params->memcg_caches array */
351 int kmemcg_id; 351 int kmemcg_id;
352 #endif 352 #endif
353 353
354 int last_scanned_node; 354 int last_scanned_node;
355 #if MAX_NUMNODES > 1 355 #if MAX_NUMNODES > 1
356 nodemask_t scan_nodes; 356 nodemask_t scan_nodes;
357 atomic_t numainfo_events; 357 atomic_t numainfo_events;
358 atomic_t numainfo_updating; 358 atomic_t numainfo_updating;
359 #endif 359 #endif
360 360
361 /* List of events which userspace wants to receive */ 361 /* List of events which userspace wants to receive */
362 struct list_head event_list; 362 struct list_head event_list;
363 spinlock_t event_list_lock; 363 spinlock_t event_list_lock;
364 364
365 struct mem_cgroup_per_node *nodeinfo[0]; 365 struct mem_cgroup_per_node *nodeinfo[0];
366 /* WARNING: nodeinfo must be the last member here */ 366 /* WARNING: nodeinfo must be the last member here */
367 }; 367 };
368 368
369 /* internal only representation about the status of kmem accounting. */ 369 /* internal only representation about the status of kmem accounting. */
370 enum { 370 enum {
371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 371 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
372 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 372 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
373 }; 373 };
374 374
375 #ifdef CONFIG_MEMCG_KMEM 375 #ifdef CONFIG_MEMCG_KMEM
376 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 376 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
377 { 377 {
378 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 378 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
379 } 379 }
380 380
381 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 381 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
382 { 382 {
383 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 383 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
384 } 384 }
385 385
386 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 386 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
387 { 387 {
388 /* 388 /*
389 * Our caller must use css_get() first, because memcg_uncharge_kmem() 389 * Our caller must use css_get() first, because memcg_uncharge_kmem()
390 * will call css_put() if it sees the memcg is dead. 390 * will call css_put() if it sees the memcg is dead.
391 */ 391 */
392 smp_wmb(); 392 smp_wmb();
393 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 393 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
394 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 394 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
395 } 395 }
396 396
397 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 397 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
398 { 398 {
399 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 399 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
400 &memcg->kmem_account_flags); 400 &memcg->kmem_account_flags);
401 } 401 }
402 #endif 402 #endif
403 403
404 /* Stuffs for move charges at task migration. */ 404 /* Stuffs for move charges at task migration. */
405 /* 405 /*
406 * Types of charges to be moved. "move_charge_at_immigrate" and 406 * Types of charges to be moved. "move_charge_at_immigrate" and
407 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 407 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
408 */ 408 */
409 enum move_type { 409 enum move_type {
410 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 410 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
411 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 411 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
412 NR_MOVE_TYPE, 412 NR_MOVE_TYPE,
413 }; 413 };
414 414
415 /* "mc" and its members are protected by cgroup_mutex */ 415 /* "mc" and its members are protected by cgroup_mutex */
416 static struct move_charge_struct { 416 static struct move_charge_struct {
417 spinlock_t lock; /* for from, to */ 417 spinlock_t lock; /* for from, to */
418 struct mem_cgroup *from; 418 struct mem_cgroup *from;
419 struct mem_cgroup *to; 419 struct mem_cgroup *to;
420 unsigned long immigrate_flags; 420 unsigned long immigrate_flags;
421 unsigned long precharge; 421 unsigned long precharge;
422 unsigned long moved_charge; 422 unsigned long moved_charge;
423 unsigned long moved_swap; 423 unsigned long moved_swap;
424 struct task_struct *moving_task; /* a task moving charges */ 424 struct task_struct *moving_task; /* a task moving charges */
425 wait_queue_head_t waitq; /* a waitq for other context */ 425 wait_queue_head_t waitq; /* a waitq for other context */
426 } mc = { 426 } mc = {
427 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 427 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
428 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 428 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
429 }; 429 };
430 430
431 static bool move_anon(void) 431 static bool move_anon(void)
432 { 432 {
433 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 433 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
434 } 434 }
435 435
436 static bool move_file(void) 436 static bool move_file(void)
437 { 437 {
438 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 438 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
439 } 439 }
440 440
441 /* 441 /*
442 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 442 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
443 * limit reclaim to prevent infinite loops, if they ever occur. 443 * limit reclaim to prevent infinite loops, if they ever occur.
444 */ 444 */
445 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 445 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
446 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 446 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
447 447
448 enum charge_type { 448 enum charge_type {
449 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 449 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
450 MEM_CGROUP_CHARGE_TYPE_ANON, 450 MEM_CGROUP_CHARGE_TYPE_ANON,
451 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 451 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
452 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 452 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
453 NR_CHARGE_TYPE, 453 NR_CHARGE_TYPE,
454 }; 454 };
455 455
456 /* for encoding cft->private value on file */ 456 /* for encoding cft->private value on file */
457 enum res_type { 457 enum res_type {
458 _MEM, 458 _MEM,
459 _MEMSWAP, 459 _MEMSWAP,
460 _OOM_TYPE, 460 _OOM_TYPE,
461 _KMEM, 461 _KMEM,
462 }; 462 };
463 463
464 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 464 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
465 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 465 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
466 #define MEMFILE_ATTR(val) ((val) & 0xffff) 466 #define MEMFILE_ATTR(val) ((val) & 0xffff)
467 /* Used for OOM notifier */ 467 /* Used for OOM notifier */
468 #define OOM_CONTROL (0) 468 #define OOM_CONTROL (0)
469 469
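Editor's note: the MEMFILE_* macros pack a res_type and an attribute into the single cft->private word used by the memory.* file handlers. A small hedged round-trip check, written as if it lived inside memcontrol.c and using only the constants defined above; memfile_encoding_demo() is illustrative.

/* Encode (type, attr) into one value and recover both pieces. */
static void memfile_encoding_demo(void)
{
        unsigned long priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);

        WARN_ON(MEMFILE_TYPE(priv) != _OOM_TYPE);
        WARN_ON(MEMFILE_ATTR(priv) != OOM_CONTROL);
}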
470 /* 470 /*
471 * The memcg_create_mutex will be held whenever a new cgroup is created. 471 * The memcg_create_mutex will be held whenever a new cgroup is created.
472 * As a consequence, any change that needs to protect against new child cgroups 472 * As a consequence, any change that needs to protect against new child cgroups
473 * appearing has to hold it as well. 473 * appearing has to hold it as well.
474 */ 474 */
475 static DEFINE_MUTEX(memcg_create_mutex); 475 static DEFINE_MUTEX(memcg_create_mutex);
476 476
477 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 477 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
478 { 478 {
479 return s ? container_of(s, struct mem_cgroup, css) : NULL; 479 return s ? container_of(s, struct mem_cgroup, css) : NULL;
480 } 480 }
481 481
482 /* Some nice accessors for the vmpressure. */ 482 /* Some nice accessors for the vmpressure. */
483 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 483 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
484 { 484 {
485 if (!memcg) 485 if (!memcg)
486 memcg = root_mem_cgroup; 486 memcg = root_mem_cgroup;
487 return &memcg->vmpressure; 487 return &memcg->vmpressure;
488 } 488 }
489 489
490 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 490 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
491 { 491 {
492 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 492 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
493 } 493 }
494 494
495 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 495 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
496 { 496 {
497 return (memcg == root_mem_cgroup); 497 return (memcg == root_mem_cgroup);
498 } 498 }
499 499
500 /* 500 /*
501 * We restrict the id in the range of [1, 65535], so it can fit into 501 * We restrict the id in the range of [1, 65535], so it can fit into
502 * an unsigned short. 502 * an unsigned short.
503 */ 503 */
504 #define MEM_CGROUP_ID_MAX USHRT_MAX 504 #define MEM_CGROUP_ID_MAX USHRT_MAX
505 505
506 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 506 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
507 { 507 {
508 return memcg->css.id; 508 return memcg->css.id;
509 } 509 }
510 510
511 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 511 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
512 { 512 {
513 struct cgroup_subsys_state *css; 513 struct cgroup_subsys_state *css;
514 514
515 css = css_from_id(id, &memory_cgrp_subsys); 515 css = css_from_id(id, &memory_cgrp_subsys);
516 return mem_cgroup_from_css(css); 516 return mem_cgroup_from_css(css);
517 } 517 }
518 518
519 /* Writing them here to avoid exposing memcg's inner layout */ 519 /* Writing them here to avoid exposing memcg's inner layout */
520 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 520 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
521 521
522 void sock_update_memcg(struct sock *sk) 522 void sock_update_memcg(struct sock *sk)
523 { 523 {
524 if (mem_cgroup_sockets_enabled) { 524 if (mem_cgroup_sockets_enabled) {
525 struct mem_cgroup *memcg; 525 struct mem_cgroup *memcg;
526 struct cg_proto *cg_proto; 526 struct cg_proto *cg_proto;
527 527
528 BUG_ON(!sk->sk_prot->proto_cgroup); 528 BUG_ON(!sk->sk_prot->proto_cgroup);
529 529
530 /* Socket cloning can throw us here with sk_cgrp already 530 /* Socket cloning can throw us here with sk_cgrp already
531 * filled. It won't, however, necessarily happen from 531 * filled. It won't, however, necessarily happen from
532 * process context. So the test for root memcg given 532 * process context. So the test for root memcg given
533 * the current task's memcg won't help us in this case. 533 * the current task's memcg won't help us in this case.
534 * 534 *
535 * Respecting the original socket's memcg is a better 535 * Respecting the original socket's memcg is a better
536 * decision in this case. 536 * decision in this case.
537 */ 537 */
538 if (sk->sk_cgrp) { 538 if (sk->sk_cgrp) {
539 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 539 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
540 css_get(&sk->sk_cgrp->memcg->css); 540 css_get(&sk->sk_cgrp->memcg->css);
541 return; 541 return;
542 } 542 }
543 543
544 rcu_read_lock(); 544 rcu_read_lock();
545 memcg = mem_cgroup_from_task(current); 545 memcg = mem_cgroup_from_task(current);
546 cg_proto = sk->sk_prot->proto_cgroup(memcg); 546 cg_proto = sk->sk_prot->proto_cgroup(memcg);
547 if (!mem_cgroup_is_root(memcg) && 547 if (!mem_cgroup_is_root(memcg) &&
548 memcg_proto_active(cg_proto) && 548 memcg_proto_active(cg_proto) &&
549 css_tryget_online(&memcg->css)) { 549 css_tryget_online(&memcg->css)) {
550 sk->sk_cgrp = cg_proto; 550 sk->sk_cgrp = cg_proto;
551 } 551 }
552 rcu_read_unlock(); 552 rcu_read_unlock();
553 } 553 }
554 } 554 }
555 EXPORT_SYMBOL(sock_update_memcg); 555 EXPORT_SYMBOL(sock_update_memcg);
556 556
557 void sock_release_memcg(struct sock *sk) 557 void sock_release_memcg(struct sock *sk)
558 { 558 {
559 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 559 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
560 struct mem_cgroup *memcg; 560 struct mem_cgroup *memcg;
561 WARN_ON(!sk->sk_cgrp->memcg); 561 WARN_ON(!sk->sk_cgrp->memcg);
562 memcg = sk->sk_cgrp->memcg; 562 memcg = sk->sk_cgrp->memcg;
563 css_put(&sk->sk_cgrp->memcg->css); 563 css_put(&sk->sk_cgrp->memcg->css);
564 } 564 }
565 } 565 }
566 566
567 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 567 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
568 { 568 {
569 if (!memcg || mem_cgroup_is_root(memcg)) 569 if (!memcg || mem_cgroup_is_root(memcg))
570 return NULL; 570 return NULL;
571 571
572 return &memcg->tcp_mem; 572 return &memcg->tcp_mem;
573 } 573 }
574 EXPORT_SYMBOL(tcp_proto_cgroup); 574 EXPORT_SYMBOL(tcp_proto_cgroup);
575 575
576 static void disarm_sock_keys(struct mem_cgroup *memcg) 576 static void disarm_sock_keys(struct mem_cgroup *memcg)
577 { 577 {
578 if (!memcg_proto_activated(&memcg->tcp_mem)) 578 if (!memcg_proto_activated(&memcg->tcp_mem))
579 return; 579 return;
580 static_key_slow_dec(&memcg_socket_limit_enabled); 580 static_key_slow_dec(&memcg_socket_limit_enabled);
581 } 581 }
582 #else 582 #else
583 static void disarm_sock_keys(struct mem_cgroup *memcg) 583 static void disarm_sock_keys(struct mem_cgroup *memcg)
584 { 584 {
585 } 585 }
586 #endif 586 #endif
587 587
588 #ifdef CONFIG_MEMCG_KMEM 588 #ifdef CONFIG_MEMCG_KMEM
589 /* 589 /*
590 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 590 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
591 * The main reason for not using the cgroup id for this is that it works 591 * The main reason for not using the cgroup id for this is that it works
592 * better in sparse environments, where we have a lot of memcgs but only a 592 * better in sparse environments, where we have a lot of memcgs but only a
593 * few of them kmem-limited. If we had, for instance, 200 memcgs and none 593 * few of them kmem-limited. If we had, for instance, 200 memcgs and none
594 * but the 200th were kmem-limited, indexing by cgroup id would force a 594 * but the 200th were kmem-limited, indexing by cgroup id would force a
595 * 200-entry array for that single group. 595 * 200-entry array for that single group.
596 * 596 *
597 * The current size of the caches array is stored in 597 * The current size of the caches array is stored in
598 * memcg_limited_groups_array_size. It will double each time we have to 598 * memcg_limited_groups_array_size. It will double each time we have to
599 * increase it. 599 * increase it.
600 */ 600 */
601 static DEFINE_IDA(kmem_limited_groups); 601 static DEFINE_IDA(kmem_limited_groups);
602 int memcg_limited_groups_array_size; 602 int memcg_limited_groups_array_size;
603 603
604 /* 604 /*
605 * MIN_SIZE is different than 1, because we would like to avoid going through 605 * MIN_SIZE is different than 1, because we would like to avoid going through
606 * the alloc/free process all the time. In a small machine, 4 kmem-limited 606 * the alloc/free process all the time. In a small machine, 4 kmem-limited
607 * cgroups is a reasonable guess. In the future, it could be a parameter or 607 * cgroups is a reasonable guess. In the future, it could be a parameter or
608 * tunable, but that is strictly not necessary. 608 * tunable, but that is strictly not necessary.
609 * 609 *
610 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 610 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
611 * this constant directly from cgroup, but it is understandable that this is 611 * this constant directly from cgroup, but it is understandable that this is
612 * better kept as an internal representation in cgroup.c. In any case, the 612 * better kept as an internal representation in cgroup.c. In any case, the
613 * cgrp_id space is not getting any smaller, and we don't have to necessarily 613 * cgrp_id space is not getting any smaller, and we don't have to necessarily
614 * increase ours as well if it increases. 614 * increase ours as well if it increases.
615 */ 615 */
616 #define MEMCG_CACHES_MIN_SIZE 4 616 #define MEMCG_CACHES_MIN_SIZE 4
617 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 617 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
618 618
619 /* 619 /*
620 * A lot of the calls to the cache allocation functions are expected to be 620 * A lot of the calls to the cache allocation functions are expected to be
621 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 621 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
622 * conditional on this static branch, we have to allow modules that do 622 * conditional on this static branch, we have to allow modules that do
623 * kmem_cache_alloc and the like to see this symbol as well. 623 * kmem_cache_alloc and the like to see this symbol as well.
624 */ 624 */
625 struct static_key memcg_kmem_enabled_key; 625 struct static_key memcg_kmem_enabled_key;
626 EXPORT_SYMBOL(memcg_kmem_enabled_key); 626 EXPORT_SYMBOL(memcg_kmem_enabled_key);
627 627
628 static void memcg_free_cache_id(int id); 628 static void memcg_free_cache_id(int id);
629 629
630 static void disarm_kmem_keys(struct mem_cgroup *memcg) 630 static void disarm_kmem_keys(struct mem_cgroup *memcg)
631 { 631 {
632 if (memcg_kmem_is_active(memcg)) { 632 if (memcg_kmem_is_active(memcg)) {
633 static_key_slow_dec(&memcg_kmem_enabled_key); 633 static_key_slow_dec(&memcg_kmem_enabled_key);
634 memcg_free_cache_id(memcg->kmemcg_id); 634 memcg_free_cache_id(memcg->kmemcg_id);
635 } 635 }
636 /* 636 /*
637 * This check can't live in kmem destruction function, 637 * This check can't live in kmem destruction function,
638 * since the charges will outlive the cgroup 638 * since the charges will outlive the cgroup
639 */ 639 */
640 WARN_ON(page_counter_read(&memcg->kmem)); 640 WARN_ON(page_counter_read(&memcg->kmem));
641 } 641 }
642 #else 642 #else
643 static void disarm_kmem_keys(struct mem_cgroup *memcg) 643 static void disarm_kmem_keys(struct mem_cgroup *memcg)
644 { 644 {
645 } 645 }
646 #endif /* CONFIG_MEMCG_KMEM */ 646 #endif /* CONFIG_MEMCG_KMEM */
647 647
648 static void disarm_static_keys(struct mem_cgroup *memcg) 648 static void disarm_static_keys(struct mem_cgroup *memcg)
649 { 649 {
650 disarm_sock_keys(memcg); 650 disarm_sock_keys(memcg);
651 disarm_kmem_keys(memcg); 651 disarm_kmem_keys(memcg);
652 } 652 }
653 653
654 static void drain_all_stock_async(struct mem_cgroup *memcg); 654 static void drain_all_stock_async(struct mem_cgroup *memcg);
655 655
656 static struct mem_cgroup_per_zone * 656 static struct mem_cgroup_per_zone *
657 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 657 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
658 { 658 {
659 int nid = zone_to_nid(zone); 659 int nid = zone_to_nid(zone);
660 int zid = zone_idx(zone); 660 int zid = zone_idx(zone);
661 661
662 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 662 return &memcg->nodeinfo[nid]->zoneinfo[zid];
663 } 663 }
664 664
665 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 665 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
666 { 666 {
667 return &memcg->css; 667 return &memcg->css;
668 } 668 }
669 669
670 static struct mem_cgroup_per_zone * 670 static struct mem_cgroup_per_zone *
671 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) 671 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
672 { 672 {
673 int nid = page_to_nid(page); 673 int nid = page_to_nid(page);
674 int zid = page_zonenum(page); 674 int zid = page_zonenum(page);
675 675
676 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 676 return &memcg->nodeinfo[nid]->zoneinfo[zid];
677 } 677 }
678 678
679 static struct mem_cgroup_tree_per_zone * 679 static struct mem_cgroup_tree_per_zone *
680 soft_limit_tree_node_zone(int nid, int zid) 680 soft_limit_tree_node_zone(int nid, int zid)
681 { 681 {
682 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 682 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
683 } 683 }
684 684
685 static struct mem_cgroup_tree_per_zone * 685 static struct mem_cgroup_tree_per_zone *
686 soft_limit_tree_from_page(struct page *page) 686 soft_limit_tree_from_page(struct page *page)
687 { 687 {
688 int nid = page_to_nid(page); 688 int nid = page_to_nid(page);
689 int zid = page_zonenum(page); 689 int zid = page_zonenum(page);
690 690
691 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 691 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
692 } 692 }
693 693
694 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, 694 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
695 struct mem_cgroup_tree_per_zone *mctz, 695 struct mem_cgroup_tree_per_zone *mctz,
696 unsigned long new_usage_in_excess) 696 unsigned long new_usage_in_excess)
697 { 697 {
698 struct rb_node **p = &mctz->rb_root.rb_node; 698 struct rb_node **p = &mctz->rb_root.rb_node;
699 struct rb_node *parent = NULL; 699 struct rb_node *parent = NULL;
700 struct mem_cgroup_per_zone *mz_node; 700 struct mem_cgroup_per_zone *mz_node;
701 701
702 if (mz->on_tree) 702 if (mz->on_tree)
703 return; 703 return;
704 704
705 mz->usage_in_excess = new_usage_in_excess; 705 mz->usage_in_excess = new_usage_in_excess;
706 if (!mz->usage_in_excess) 706 if (!mz->usage_in_excess)
707 return; 707 return;
708 while (*p) { 708 while (*p) {
709 parent = *p; 709 parent = *p;
710 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 710 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
711 tree_node); 711 tree_node);
712 if (mz->usage_in_excess < mz_node->usage_in_excess) 712 if (mz->usage_in_excess < mz_node->usage_in_excess)
713 p = &(*p)->rb_left; 713 p = &(*p)->rb_left;
714 /* 714 /*
715 * We can't avoid mem cgroups that are over their soft 715 * We can't avoid mem cgroups that are over their soft
716 * limit by the same amount 716 * limit by the same amount
717 */ 717 */
718 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 718 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
719 p = &(*p)->rb_right; 719 p = &(*p)->rb_right;
720 } 720 }
721 rb_link_node(&mz->tree_node, parent, p); 721 rb_link_node(&mz->tree_node, parent, p);
722 rb_insert_color(&mz->tree_node, &mctz->rb_root); 722 rb_insert_color(&mz->tree_node, &mctz->rb_root);
723 mz->on_tree = true; 723 mz->on_tree = true;
724 } 724 }
725 725
726 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 726 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
727 struct mem_cgroup_tree_per_zone *mctz) 727 struct mem_cgroup_tree_per_zone *mctz)
728 { 728 {
729 if (!mz->on_tree) 729 if (!mz->on_tree)
730 return; 730 return;
731 rb_erase(&mz->tree_node, &mctz->rb_root); 731 rb_erase(&mz->tree_node, &mctz->rb_root);
732 mz->on_tree = false; 732 mz->on_tree = false;
733 } 733 }
734 734
735 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, 735 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
736 struct mem_cgroup_tree_per_zone *mctz) 736 struct mem_cgroup_tree_per_zone *mctz)
737 { 737 {
738 unsigned long flags; 738 unsigned long flags;
739 739
740 spin_lock_irqsave(&mctz->lock, flags); 740 spin_lock_irqsave(&mctz->lock, flags);
741 __mem_cgroup_remove_exceeded(mz, mctz); 741 __mem_cgroup_remove_exceeded(mz, mctz);
742 spin_unlock_irqrestore(&mctz->lock, flags); 742 spin_unlock_irqrestore(&mctz->lock, flags);
743 } 743 }
744 744
745 static unsigned long soft_limit_excess(struct mem_cgroup *memcg) 745 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
746 { 746 {
747 unsigned long nr_pages = page_counter_read(&memcg->memory); 747 unsigned long nr_pages = page_counter_read(&memcg->memory);
748 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); 748 unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit);
749 unsigned long excess = 0; 749 unsigned long excess = 0;
750 750
751 if (nr_pages > soft_limit) 751 if (nr_pages > soft_limit)
752 excess = nr_pages - soft_limit; 752 excess = nr_pages - soft_limit;
753 753
754 return excess; 754 return excess;
755 } 755 }
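A quick numeric illustration of soft_limit_excess(): if the memcg's page counter reads 1200 pages and memcg->soft_limit is 1000, the function returns 200; at or below the soft limit it returns 0, and __mem_cgroup_insert_exceeded() above then declines to put the group on the soft-limit tree.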
756 756
757 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 757 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
758 { 758 {
759 unsigned long excess; 759 unsigned long excess;
760 struct mem_cgroup_per_zone *mz; 760 struct mem_cgroup_per_zone *mz;
761 struct mem_cgroup_tree_per_zone *mctz; 761 struct mem_cgroup_tree_per_zone *mctz;
762 762
763 mctz = soft_limit_tree_from_page(page); 763 mctz = soft_limit_tree_from_page(page);
764 /* 764 /*
765 * Necessary to update all ancestors when hierarchy is used, 765 * Necessary to update all ancestors when hierarchy is used,
766 * because their event counter is not touched. 766 * because their event counter is not touched.
767 */ 767 */
768 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 768 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
769 mz = mem_cgroup_page_zoneinfo(memcg, page); 769 mz = mem_cgroup_page_zoneinfo(memcg, page);
770 excess = soft_limit_excess(memcg); 770 excess = soft_limit_excess(memcg);
771 /* 771 /*
772 * We have to update the tree if mz is on RB-tree or 772 * We have to update the tree if mz is on RB-tree or
773 * mem is over its softlimit. 773 * mem is over its softlimit.
774 */ 774 */
775 if (excess || mz->on_tree) { 775 if (excess || mz->on_tree) {
776 unsigned long flags; 776 unsigned long flags;
777 777
778 spin_lock_irqsave(&mctz->lock, flags); 778 spin_lock_irqsave(&mctz->lock, flags);
779 /* if on-tree, remove it */ 779 /* if on-tree, remove it */
780 if (mz->on_tree) 780 if (mz->on_tree)
781 __mem_cgroup_remove_exceeded(mz, mctz); 781 __mem_cgroup_remove_exceeded(mz, mctz);
782 /* 782 /*
783 * Insert again. mz->usage_in_excess will be updated. 783 * Insert again. mz->usage_in_excess will be updated.
784 * If excess is 0, no tree ops. 784 * If excess is 0, no tree ops.
785 */ 785 */
786 __mem_cgroup_insert_exceeded(mz, mctz, excess); 786 __mem_cgroup_insert_exceeded(mz, mctz, excess);
787 spin_unlock_irqrestore(&mctz->lock, flags); 787 spin_unlock_irqrestore(&mctz->lock, flags);
788 } 788 }
789 } 789 }
790 } 790 }
791 791
792 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 792 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
793 { 793 {
794 struct mem_cgroup_tree_per_zone *mctz; 794 struct mem_cgroup_tree_per_zone *mctz;
795 struct mem_cgroup_per_zone *mz; 795 struct mem_cgroup_per_zone *mz;
796 int nid, zid; 796 int nid, zid;
797 797
798 for_each_node(nid) { 798 for_each_node(nid) {
799 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 799 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
800 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 800 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
801 mctz = soft_limit_tree_node_zone(nid, zid); 801 mctz = soft_limit_tree_node_zone(nid, zid);
802 mem_cgroup_remove_exceeded(mz, mctz); 802 mem_cgroup_remove_exceeded(mz, mctz);
803 } 803 }
804 } 804 }
805 } 805 }
806 806
807 static struct mem_cgroup_per_zone * 807 static struct mem_cgroup_per_zone *
808 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 808 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
809 { 809 {
810 struct rb_node *rightmost = NULL; 810 struct rb_node *rightmost = NULL;
811 struct mem_cgroup_per_zone *mz; 811 struct mem_cgroup_per_zone *mz;
812 812
813 retry: 813 retry:
814 mz = NULL; 814 mz = NULL;
815 rightmost = rb_last(&mctz->rb_root); 815 rightmost = rb_last(&mctz->rb_root);
816 if (!rightmost) 816 if (!rightmost)
817 goto done; /* Nothing to reclaim from */ 817 goto done; /* Nothing to reclaim from */
818 818
819 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 819 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
820 /* 820 /*
821 * Remove the node now but someone else can add it back, 821 * Remove the node now but someone else can add it back,
822 * we will add it back at the end of reclaim to its correct 822 * we will add it back at the end of reclaim to its correct
823 * position in the tree. 823 * position in the tree.
824 */ 824 */
825 __mem_cgroup_remove_exceeded(mz, mctz); 825 __mem_cgroup_remove_exceeded(mz, mctz);
826 if (!soft_limit_excess(mz->memcg) || 826 if (!soft_limit_excess(mz->memcg) ||
827 !css_tryget_online(&mz->memcg->css)) 827 !css_tryget_online(&mz->memcg->css))
828 goto retry; 828 goto retry;
829 done: 829 done:
830 return mz; 830 return mz;
831 } 831 }
832 832
833 static struct mem_cgroup_per_zone * 833 static struct mem_cgroup_per_zone *
834 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 834 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
835 { 835 {
836 struct mem_cgroup_per_zone *mz; 836 struct mem_cgroup_per_zone *mz;
837 837
838 spin_lock_irq(&mctz->lock); 838 spin_lock_irq(&mctz->lock);
839 mz = __mem_cgroup_largest_soft_limit_node(mctz); 839 mz = __mem_cgroup_largest_soft_limit_node(mctz);
840 spin_unlock_irq(&mctz->lock); 840 spin_unlock_irq(&mctz->lock);
841 return mz; 841 return mz;
842 } 842 }
843 843
844 /* 844 /*
845 * Implementation Note: reading percpu statistics for memcg. 845 * Implementation Note: reading percpu statistics for memcg.
846 * 846 *
847 * Both vmstat[] and percpu_counter have thresholds and do periodic 847 * Both vmstat[] and percpu_counter have thresholds and do periodic
848 * synchronization to implement a "quick" read. There is a trade-off between 848 * synchronization to implement a "quick" read. There is a trade-off between
849 * reading cost and precision of the value, so we may eventually implement 849 * reading cost and precision of the value, so we may eventually implement
850 * periodic synchronization of memcg's own counters as well. 850 * periodic synchronization of memcg's own counters as well.
851 * 851 *
852 * But this _read() function is currently used for the user interface. Users 852 * But this _read() function is currently used for the user interface. Users
853 * account memory usage per memory cgroup and always require an exact value, 853 * account memory usage per memory cgroup and always require an exact value,
854 * because they are doing accounting. Even with a quick-and-fuzzy read we 854 * because they are doing accounting. Even with a quick-and-fuzzy read we
855 * would still have to visit all online cpus and sum them up, so for now no 855 * would still have to visit all online cpus and sum them up, so for now no
856 * extra synchronization is implemented (only the cpu-hotplug case is handled). 856 * extra synchronization is implemented (only the cpu-hotplug case is handled).
857 * 857 *
858 * If kernel-internal users appear that can tolerate an inexact value, and 858 * If kernel-internal users appear that can tolerate an inexact value, and
859 * reading all cpu values turns out to be a performance bottleneck in some 859 * reading all cpu values turns out to be a performance bottleneck in some
860 * common workload, thresholds and synchronization like vmstat[]'s should be 860 * common workload, thresholds and synchronization like vmstat[]'s should be
861 * implemented. 861 * implemented.
862 */ 862 */
863 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 863 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
864 enum mem_cgroup_stat_index idx) 864 enum mem_cgroup_stat_index idx)
865 { 865 {
866 long val = 0; 866 long val = 0;
867 int cpu; 867 int cpu;
868 868
869 get_online_cpus(); 869 get_online_cpus();
870 for_each_online_cpu(cpu) 870 for_each_online_cpu(cpu)
871 val += per_cpu(memcg->stat->count[idx], cpu); 871 val += per_cpu(memcg->stat->count[idx], cpu);
872 #ifdef CONFIG_HOTPLUG_CPU 872 #ifdef CONFIG_HOTPLUG_CPU
873 spin_lock(&memcg->pcp_counter_lock); 873 spin_lock(&memcg->pcp_counter_lock);
874 val += memcg->nocpu_base.count[idx]; 874 val += memcg->nocpu_base.count[idx];
875 spin_unlock(&memcg->pcp_counter_lock); 875 spin_unlock(&memcg->pcp_counter_lock);
876 #endif 876 #endif
877 put_online_cpus(); 877 put_online_cpus();
878 return val; 878 return val;
879 } 879 }
880 880
881 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 881 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
882 enum mem_cgroup_events_index idx) 882 enum mem_cgroup_events_index idx)
883 { 883 {
884 unsigned long val = 0; 884 unsigned long val = 0;
885 int cpu; 885 int cpu;
886 886
887 get_online_cpus(); 887 get_online_cpus();
888 for_each_online_cpu(cpu) 888 for_each_online_cpu(cpu)
889 val += per_cpu(memcg->stat->events[idx], cpu); 889 val += per_cpu(memcg->stat->events[idx], cpu);
890 #ifdef CONFIG_HOTPLUG_CPU 890 #ifdef CONFIG_HOTPLUG_CPU
891 spin_lock(&memcg->pcp_counter_lock); 891 spin_lock(&memcg->pcp_counter_lock);
892 val += memcg->nocpu_base.events[idx]; 892 val += memcg->nocpu_base.events[idx];
893 spin_unlock(&memcg->pcp_counter_lock); 893 spin_unlock(&memcg->pcp_counter_lock);
894 #endif 894 #endif
895 put_online_cpus(); 895 put_online_cpus();
896 return val; 896 return val;
897 } 897 }
898 898
899 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 899 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
900 struct page *page, 900 struct page *page,
901 int nr_pages) 901 int nr_pages)
902 { 902 {
903 /* 903 /*
904 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 904 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
905 * counted as CACHE even if it's on ANON LRU. 905 * counted as CACHE even if it's on ANON LRU.
906 */ 906 */
907 if (PageAnon(page)) 907 if (PageAnon(page))
908 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 908 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
909 nr_pages); 909 nr_pages);
910 else 910 else
911 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 911 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
912 nr_pages); 912 nr_pages);
913 913
914 if (PageTransHuge(page)) 914 if (PageTransHuge(page))
915 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 915 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
916 nr_pages); 916 nr_pages);
917 917
918 /* pagein of a big page is an event. So, ignore page size */ 918 /* pagein of a big page is an event. So, ignore page size */
919 if (nr_pages > 0) 919 if (nr_pages > 0)
920 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 920 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
921 else { 921 else {
922 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 922 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
923 nr_pages = -nr_pages; /* for event */ 923 nr_pages = -nr_pages; /* for event */
924 } 924 }
925 925
926 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 926 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
927 } 927 }
928 928
929 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 929 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
930 { 930 {
931 struct mem_cgroup_per_zone *mz; 931 struct mem_cgroup_per_zone *mz;
932 932
933 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 933 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
934 return mz->lru_size[lru]; 934 return mz->lru_size[lru];
935 } 935 }
936 936
937 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 937 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
938 int nid, 938 int nid,
939 unsigned int lru_mask) 939 unsigned int lru_mask)
940 { 940 {
941 unsigned long nr = 0; 941 unsigned long nr = 0;
942 int zid; 942 int zid;
943 943
944 VM_BUG_ON((unsigned)nid >= nr_node_ids); 944 VM_BUG_ON((unsigned)nid >= nr_node_ids);
945 945
946 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 946 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
947 struct mem_cgroup_per_zone *mz; 947 struct mem_cgroup_per_zone *mz;
948 enum lru_list lru; 948 enum lru_list lru;
949 949
950 for_each_lru(lru) { 950 for_each_lru(lru) {
951 if (!(BIT(lru) & lru_mask)) 951 if (!(BIT(lru) & lru_mask))
952 continue; 952 continue;
953 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 953 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
954 nr += mz->lru_size[lru]; 954 nr += mz->lru_size[lru];
955 } 955 }
956 } 956 }
957 return nr; 957 return nr;
958 } 958 }
959 959
960 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 960 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
961 unsigned int lru_mask) 961 unsigned int lru_mask)
962 { 962 {
963 unsigned long nr = 0; 963 unsigned long nr = 0;
964 int nid; 964 int nid;
965 965
966 for_each_node_state(nid, N_MEMORY) 966 for_each_node_state(nid, N_MEMORY)
967 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 967 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
968 return nr; 968 return nr;
969 } 969 }
970 970
971 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 971 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
972 enum mem_cgroup_events_target target) 972 enum mem_cgroup_events_target target)
973 { 973 {
974 unsigned long val, next; 974 unsigned long val, next;
975 975
976 val = __this_cpu_read(memcg->stat->nr_page_events); 976 val = __this_cpu_read(memcg->stat->nr_page_events);
977 next = __this_cpu_read(memcg->stat->targets[target]); 977 next = __this_cpu_read(memcg->stat->targets[target]);
978 /* from time_after() in jiffies.h */ 978 /* from time_after() in jiffies.h */
979 if ((long)next - (long)val < 0) { 979 if ((long)next - (long)val < 0) {
980 switch (target) { 980 switch (target) {
981 case MEM_CGROUP_TARGET_THRESH: 981 case MEM_CGROUP_TARGET_THRESH:
982 next = val + THRESHOLDS_EVENTS_TARGET; 982 next = val + THRESHOLDS_EVENTS_TARGET;
983 break; 983 break;
984 case MEM_CGROUP_TARGET_SOFTLIMIT: 984 case MEM_CGROUP_TARGET_SOFTLIMIT:
985 next = val + SOFTLIMIT_EVENTS_TARGET; 985 next = val + SOFTLIMIT_EVENTS_TARGET;
986 break; 986 break;
987 case MEM_CGROUP_TARGET_NUMAINFO: 987 case MEM_CGROUP_TARGET_NUMAINFO:
988 next = val + NUMAINFO_EVENTS_TARGET; 988 next = val + NUMAINFO_EVENTS_TARGET;
989 break; 989 break;
990 default: 990 default:
991 break; 991 break;
992 } 992 }
993 __this_cpu_write(memcg->stat->targets[target], next); 993 __this_cpu_write(memcg->stat->targets[target], next);
994 return true; 994 return true;
995 } 995 }
996 return false; 996 return false;
997 } 997 }
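The "(long)next - (long)val < 0" test in mem_cgroup_event_ratelimit() above borrows the time_after() idiom named in the comment: comparing through a signed difference keeps giving the right answer even after the unsigned event counter wraps around. A small standalone sketch of why the naive comparison breaks while the signed-difference test does not (the values are illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long val  = (unsigned long)-5;	/* event counter just below the wrap point */
	unsigned long next = val + 128;		/* target 128 events ahead; wraps to a small value */

	/* naive unsigned comparison wrongly claims the target was already reached */
	printf("val >= next : %d\n", val >= next);			/* prints 1 (wrong) */
	/* the signed difference, as used above, still says "not yet" */
	printf("signed test : %d\n", (long)next - (long)val < 0);	/* prints 0 (right) */

	val += 200;	/* the counter advances past the target, wrapping on the way */
	printf("signed test : %d\n", (long)next - (long)val < 0);	/* prints 1 (right) */
	return 0;
}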
998 998
999 /* 999 /*
1000 * Check events in order. 1000 * Check events in order.
1001 * 1001 *
1002 */ 1002 */
1003 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1003 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1004 { 1004 {
1005 /* threshold event is triggered in finer grain than soft limit */ 1005 /* threshold event is triggered in finer grain than soft limit */
1006 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1006 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1007 MEM_CGROUP_TARGET_THRESH))) { 1007 MEM_CGROUP_TARGET_THRESH))) {
1008 bool do_softlimit; 1008 bool do_softlimit;
1009 bool do_numainfo __maybe_unused; 1009 bool do_numainfo __maybe_unused;
1010 1010
1011 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1011 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1012 MEM_CGROUP_TARGET_SOFTLIMIT); 1012 MEM_CGROUP_TARGET_SOFTLIMIT);
1013 #if MAX_NUMNODES > 1 1013 #if MAX_NUMNODES > 1
1014 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1014 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1015 MEM_CGROUP_TARGET_NUMAINFO); 1015 MEM_CGROUP_TARGET_NUMAINFO);
1016 #endif 1016 #endif
1017 mem_cgroup_threshold(memcg); 1017 mem_cgroup_threshold(memcg);
1018 if (unlikely(do_softlimit)) 1018 if (unlikely(do_softlimit))
1019 mem_cgroup_update_tree(memcg, page); 1019 mem_cgroup_update_tree(memcg, page);
1020 #if MAX_NUMNODES > 1 1020 #if MAX_NUMNODES > 1
1021 if (unlikely(do_numainfo)) 1021 if (unlikely(do_numainfo))
1022 atomic_inc(&memcg->numainfo_events); 1022 atomic_inc(&memcg->numainfo_events);
1023 #endif 1023 #endif
1024 } 1024 }
1025 } 1025 }
1026 1026
1027 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1027 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1028 { 1028 {
1029 /* 1029 /*
1030 * mm_update_next_owner() may clear mm->owner to NULL 1030 * mm_update_next_owner() may clear mm->owner to NULL
1031 * if it races with swapoff, page migration, etc. 1031 * if it races with swapoff, page migration, etc.
1032 * So this can be called with p == NULL. 1032 * So this can be called with p == NULL.
1033 */ 1033 */
1034 if (unlikely(!p)) 1034 if (unlikely(!p))
1035 return NULL; 1035 return NULL;
1036 1036
1037 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1037 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1038 } 1038 }
1039 1039
1040 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1040 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1041 { 1041 {
1042 struct mem_cgroup *memcg = NULL; 1042 struct mem_cgroup *memcg = NULL;
1043 1043
1044 rcu_read_lock(); 1044 rcu_read_lock();
1045 do { 1045 do {
1046 /* 1046 /*
1047 * Page cache insertions can happen without an 1047 * Page cache insertions can happen without an
1048 * actual mm context, e.g. during disk probing 1048 * actual mm context, e.g. during disk probing
1049 * on boot, loopback IO, acct() writes etc. 1049 * on boot, loopback IO, acct() writes etc.
1050 */ 1050 */
1051 if (unlikely(!mm)) 1051 if (unlikely(!mm))
1052 memcg = root_mem_cgroup; 1052 memcg = root_mem_cgroup;
1053 else { 1053 else {
1054 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1054 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1055 if (unlikely(!memcg)) 1055 if (unlikely(!memcg))
1056 memcg = root_mem_cgroup; 1056 memcg = root_mem_cgroup;
1057 } 1057 }
1058 } while (!css_tryget_online(&memcg->css)); 1058 } while (!css_tryget_online(&memcg->css));
1059 rcu_read_unlock(); 1059 rcu_read_unlock();
1060 return memcg; 1060 return memcg;
1061 } 1061 }
1062 1062
1063 /** 1063 /**
1064 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1064 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1065 * @root: hierarchy root 1065 * @root: hierarchy root
1066 * @prev: previously returned memcg, NULL on first invocation 1066 * @prev: previously returned memcg, NULL on first invocation
1067 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1067 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1068 * 1068 *
1069 * Returns references to children of the hierarchy below @root, or 1069 * Returns references to children of the hierarchy below @root, or
1070 * @root itself, or %NULL after a full round-trip. 1070 * @root itself, or %NULL after a full round-trip.
1071 * 1071 *
1072 * Caller must pass the return value in @prev on subsequent 1072 * Caller must pass the return value in @prev on subsequent
1073 * invocations for reference counting, or use mem_cgroup_iter_break() 1073 * invocations for reference counting, or use mem_cgroup_iter_break()
1074 * to cancel a hierarchy walk before the round-trip is complete. 1074 * to cancel a hierarchy walk before the round-trip is complete.
1075 * 1075 *
1076 * Reclaimers can specify a zone and a priority level in @reclaim to 1076 * Reclaimers can specify a zone and a priority level in @reclaim to
1077 * divide up the memcgs in the hierarchy among all concurrent 1077 * divide up the memcgs in the hierarchy among all concurrent
1078 * reclaimers operating on the same zone and priority. 1078 * reclaimers operating on the same zone and priority.
1079 */ 1079 */
1080 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1080 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1081 struct mem_cgroup *prev, 1081 struct mem_cgroup *prev,
1082 struct mem_cgroup_reclaim_cookie *reclaim) 1082 struct mem_cgroup_reclaim_cookie *reclaim)
1083 { 1083 {
1084 struct reclaim_iter *uninitialized_var(iter); 1084 struct reclaim_iter *uninitialized_var(iter);
1085 struct cgroup_subsys_state *css = NULL; 1085 struct cgroup_subsys_state *css = NULL;
1086 struct mem_cgroup *memcg = NULL; 1086 struct mem_cgroup *memcg = NULL;
1087 struct mem_cgroup *pos = NULL; 1087 struct mem_cgroup *pos = NULL;
1088 1088
1089 if (mem_cgroup_disabled()) 1089 if (mem_cgroup_disabled())
1090 return NULL; 1090 return NULL;
1091 1091
1092 if (!root) 1092 if (!root)
1093 root = root_mem_cgroup; 1093 root = root_mem_cgroup;
1094 1094
1095 if (prev && !reclaim) 1095 if (prev && !reclaim)
1096 pos = prev; 1096 pos = prev;
1097 1097
1098 if (!root->use_hierarchy && root != root_mem_cgroup) { 1098 if (!root->use_hierarchy && root != root_mem_cgroup) {
1099 if (prev) 1099 if (prev)
1100 goto out; 1100 goto out;
1101 return root; 1101 return root;
1102 } 1102 }
1103 1103
1104 rcu_read_lock(); 1104 rcu_read_lock();
1105 1105
1106 if (reclaim) { 1106 if (reclaim) {
1107 struct mem_cgroup_per_zone *mz; 1107 struct mem_cgroup_per_zone *mz;
1108 1108
1109 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); 1109 mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
1110 iter = &mz->iter[reclaim->priority]; 1110 iter = &mz->iter[reclaim->priority];
1111 1111
1112 if (prev && reclaim->generation != iter->generation) 1112 if (prev && reclaim->generation != iter->generation)
1113 goto out_unlock; 1113 goto out_unlock;
1114 1114
1115 do { 1115 do {
1116 pos = ACCESS_ONCE(iter->position); 1116 pos = ACCESS_ONCE(iter->position);
1117 /* 1117 /*
1118 * A racing update may change the position and 1118 * A racing update may change the position and
1119 * put the last reference, hence css_tryget(), 1119 * put the last reference, hence css_tryget(),
1120 * or retry to see the updated position. 1120 * or retry to see the updated position.
1121 */ 1121 */
1122 } while (pos && !css_tryget(&pos->css)); 1122 } while (pos && !css_tryget(&pos->css));
1123 } 1123 }
1124 1124
1125 if (pos) 1125 if (pos)
1126 css = &pos->css; 1126 css = &pos->css;
1127 1127
1128 for (;;) { 1128 for (;;) {
1129 css = css_next_descendant_pre(css, &root->css); 1129 css = css_next_descendant_pre(css, &root->css);
1130 if (!css) { 1130 if (!css) {
1131 /* 1131 /*
1132 * Reclaimers share the hierarchy walk, and a 1132 * Reclaimers share the hierarchy walk, and a
1133 * new one might jump in right at the end of 1133 * new one might jump in right at the end of
1134 * the hierarchy - make sure they see at least 1134 * the hierarchy - make sure they see at least
1135 * one group and restart from the beginning. 1135 * one group and restart from the beginning.
1136 */ 1136 */
1137 if (!prev) 1137 if (!prev)
1138 continue; 1138 continue;
1139 break; 1139 break;
1140 } 1140 }
1141 1141
1142 /* 1142 /*
1143 * Verify the css and acquire a reference. The root 1143 * Verify the css and acquire a reference. The root
1144 * is provided by the caller, so we know it's alive 1144 * is provided by the caller, so we know it's alive
1145 * and kicking, and don't take an extra reference. 1145 * and kicking, and don't take an extra reference.
1146 */ 1146 */
1147 memcg = mem_cgroup_from_css(css); 1147 memcg = mem_cgroup_from_css(css);
1148 1148
1149 if (css == &root->css) 1149 if (css == &root->css)
1150 break; 1150 break;
1151 1151
1152 if (css_tryget_online(css)) { 1152 if (css_tryget_online(css)) {
1153 /* 1153 /*
1154 * Make sure the memcg is initialized: 1154 * Make sure the memcg is initialized:
1155 * mem_cgroup_css_online() orders the 1155 * mem_cgroup_css_online() orders the
1156 * initialization against setting the flag. 1156 * initialization against setting the flag.
1157 */ 1157 */
1158 if (smp_load_acquire(&memcg->initialized)) 1158 if (smp_load_acquire(&memcg->initialized))
1159 break; 1159 break;
1160 1160
1161 css_put(css); 1161 css_put(css);
1162 } 1162 }
1163 1163
1164 memcg = NULL; 1164 memcg = NULL;
1165 } 1165 }
1166 1166
1167 if (reclaim) { 1167 if (reclaim) {
1168 if (cmpxchg(&iter->position, pos, memcg) == pos) { 1168 if (cmpxchg(&iter->position, pos, memcg) == pos) {
1169 if (memcg) 1169 if (memcg)
1170 css_get(&memcg->css); 1170 css_get(&memcg->css);
1171 if (pos) 1171 if (pos)
1172 css_put(&pos->css); 1172 css_put(&pos->css);
1173 } 1173 }
1174 1174
1175 /* 1175 /*
1176 * pairs with css_tryget when dereferencing iter->position 1176 * pairs with css_tryget when dereferencing iter->position
1177 * above. 1177 * above.
1178 */ 1178 */
1179 if (pos) 1179 if (pos)
1180 css_put(&pos->css); 1180 css_put(&pos->css);
1181 1181
1182 if (!memcg) 1182 if (!memcg)
1183 iter->generation++; 1183 iter->generation++;
1184 else if (!prev) 1184 else if (!prev)
1185 reclaim->generation = iter->generation; 1185 reclaim->generation = iter->generation;
1186 } 1186 }
1187 1187
1188 out_unlock: 1188 out_unlock:
1189 rcu_read_unlock(); 1189 rcu_read_unlock();
1190 out: 1190 out:
1191 if (prev && prev != root) 1191 if (prev && prev != root)
1192 css_put(&prev->css); 1192 css_put(&prev->css);
1193 1193
1194 return memcg; 1194 return memcg;
1195 } 1195 }
1196 1196
1197 /** 1197 /**
1198 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1198 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1199 * @root: hierarchy root 1199 * @root: hierarchy root
1200 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1200 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1201 */ 1201 */
1202 void mem_cgroup_iter_break(struct mem_cgroup *root, 1202 void mem_cgroup_iter_break(struct mem_cgroup *root,
1203 struct mem_cgroup *prev) 1203 struct mem_cgroup *prev)
1204 { 1204 {
1205 if (!root) 1205 if (!root)
1206 root = root_mem_cgroup; 1206 root = root_mem_cgroup;
1207 if (prev && prev != root) 1207 if (prev && prev != root)
1208 css_put(&prev->css); 1208 css_put(&prev->css);
1209 } 1209 }
1210 1210
1211 /* 1211 /*
1212 * Iteration constructs for visiting all cgroups (under a tree). If 1212 * Iteration constructs for visiting all cgroups (under a tree). If
1213 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1213 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1214 * be used for reference counting. 1214 * be used for reference counting.
1215 */ 1215 */
1216 #define for_each_mem_cgroup_tree(iter, root) \ 1216 #define for_each_mem_cgroup_tree(iter, root) \
1217 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1217 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1218 iter != NULL; \ 1218 iter != NULL; \
1219 iter = mem_cgroup_iter(root, iter, NULL)) 1219 iter = mem_cgroup_iter(root, iter, NULL))
1220 1220
1221 #define for_each_mem_cgroup(iter) \ 1221 #define for_each_mem_cgroup(iter) \
1222 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1222 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1223 iter != NULL; \ 1223 iter != NULL; \
1224 iter = mem_cgroup_iter(NULL, iter, NULL)) 1224 iter = mem_cgroup_iter(NULL, iter, NULL))
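As the comment above says, leaving one of these loops before the walk completes must go through mem_cgroup_iter_break() so the reference on the last returned group is dropped. A minimal usage sketch, where the want_to_stop() predicate is purely illustrative and not a helper from this file:

static void walk_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (want_to_stop(iter)) {	/* illustrative predicate */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}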
1225 1225
1226 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1226 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1227 { 1227 {
1228 struct mem_cgroup *memcg; 1228 struct mem_cgroup *memcg;
1229 1229
1230 rcu_read_lock(); 1230 rcu_read_lock();
1231 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1231 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1232 if (unlikely(!memcg)) 1232 if (unlikely(!memcg))
1233 goto out; 1233 goto out;
1234 1234
1235 switch (idx) { 1235 switch (idx) {
1236 case PGFAULT: 1236 case PGFAULT:
1237 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1237 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1238 break; 1238 break;
1239 case PGMAJFAULT: 1239 case PGMAJFAULT:
1240 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1240 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1241 break; 1241 break;
1242 default: 1242 default:
1243 BUG(); 1243 BUG();
1244 } 1244 }
1245 out: 1245 out:
1246 rcu_read_unlock(); 1246 rcu_read_unlock();
1247 } 1247 }
1248 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1248 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1249 1249
1250 /** 1250 /**
1251 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1251 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1252 * @zone: zone of the wanted lruvec 1252 * @zone: zone of the wanted lruvec
1253 * @memcg: memcg of the wanted lruvec 1253 * @memcg: memcg of the wanted lruvec
1254 * 1254 *
1255 * Returns the lru list vector holding pages for the given @zone and 1255 * Returns the lru list vector holding pages for the given @zone and
1256 * @mem. This can be the global zone lruvec, if the memory controller 1256 * @mem. This can be the global zone lruvec, if the memory controller
1257 * is disabled. 1257 * is disabled.
1258 */ 1258 */
1259 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1259 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1260 struct mem_cgroup *memcg) 1260 struct mem_cgroup *memcg)
1261 { 1261 {
1262 struct mem_cgroup_per_zone *mz; 1262 struct mem_cgroup_per_zone *mz;
1263 struct lruvec *lruvec; 1263 struct lruvec *lruvec;
1264 1264
1265 if (mem_cgroup_disabled()) { 1265 if (mem_cgroup_disabled()) {
1266 lruvec = &zone->lruvec; 1266 lruvec = &zone->lruvec;
1267 goto out; 1267 goto out;
1268 } 1268 }
1269 1269
1270 mz = mem_cgroup_zone_zoneinfo(memcg, zone); 1270 mz = mem_cgroup_zone_zoneinfo(memcg, zone);
1271 lruvec = &mz->lruvec; 1271 lruvec = &mz->lruvec;
1272 out: 1272 out:
1273 /* 1273 /*
1274 * Since a node can be onlined after the mem_cgroup was created, 1274 * Since a node can be onlined after the mem_cgroup was created,
1275 * we have to be prepared to initialize lruvec->zone here; 1275 * we have to be prepared to initialize lruvec->zone here;
1276 * and if offlined then reonlined, we need to reinitialize it. 1276 * and if offlined then reonlined, we need to reinitialize it.
1277 */ 1277 */
1278 if (unlikely(lruvec->zone != zone)) 1278 if (unlikely(lruvec->zone != zone))
1279 lruvec->zone = zone; 1279 lruvec->zone = zone;
1280 return lruvec; 1280 return lruvec;
1281 } 1281 }
1282 1282
1283 /** 1283 /**
1284 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1284 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1285 * @page: the page 1285 * @page: the page
1286 * @zone: zone of the page 1286 * @zone: zone of the page
1287 */ 1287 */
1288 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1288 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1289 { 1289 {
1290 struct mem_cgroup_per_zone *mz; 1290 struct mem_cgroup_per_zone *mz;
1291 struct mem_cgroup *memcg; 1291 struct mem_cgroup *memcg;
1292 struct page_cgroup *pc; 1292 struct page_cgroup *pc;
1293 struct lruvec *lruvec; 1293 struct lruvec *lruvec;
1294 1294
1295 if (mem_cgroup_disabled()) { 1295 if (mem_cgroup_disabled()) {
1296 lruvec = &zone->lruvec; 1296 lruvec = &zone->lruvec;
1297 goto out; 1297 goto out;
1298 } 1298 }
1299 1299
1300 pc = lookup_page_cgroup(page); 1300 pc = lookup_page_cgroup(page);
1301 memcg = pc->mem_cgroup; 1301 memcg = pc->mem_cgroup;
1302 1302
1303 /* 1303 /*
1304 * Surreptitiously switch any uncharged offlist page to root: 1304 * Surreptitiously switch any uncharged offlist page to root:
1305 * an uncharged page off lru does nothing to secure 1305 * an uncharged page off lru does nothing to secure
1306 * its former mem_cgroup from sudden removal. 1306 * its former mem_cgroup from sudden removal.
1307 * 1307 *
1308 * Our caller holds lru_lock, and PageCgroupUsed is updated 1308 * Our caller holds lru_lock, and PageCgroupUsed is updated
1309 * under page_cgroup lock: between them, they make all uses 1309 * under page_cgroup lock: between them, they make all uses
1310 * of pc->mem_cgroup safe. 1310 * of pc->mem_cgroup safe.
1311 */ 1311 */
1312 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1312 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1313 pc->mem_cgroup = memcg = root_mem_cgroup; 1313 pc->mem_cgroup = memcg = root_mem_cgroup;
1314 1314
1315 mz = mem_cgroup_page_zoneinfo(memcg, page); 1315 mz = mem_cgroup_page_zoneinfo(memcg, page);
1316 lruvec = &mz->lruvec; 1316 lruvec = &mz->lruvec;
1317 out: 1317 out:
1318 /* 1318 /*
1319 * Since a node can be onlined after the mem_cgroup was created, 1319 * Since a node can be onlined after the mem_cgroup was created,
1320 * we have to be prepared to initialize lruvec->zone here; 1320 * we have to be prepared to initialize lruvec->zone here;
1321 * and if offlined then reonlined, we need to reinitialize it. 1321 * and if offlined then reonlined, we need to reinitialize it.
1322 */ 1322 */
1323 if (unlikely(lruvec->zone != zone)) 1323 if (unlikely(lruvec->zone != zone))
1324 lruvec->zone = zone; 1324 lruvec->zone = zone;
1325 return lruvec; 1325 return lruvec;
1326 } 1326 }
1327 1327
1328 /** 1328 /**
1329 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1329 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1330 * @lruvec: mem_cgroup per zone lru vector 1330 * @lruvec: mem_cgroup per zone lru vector
1331 * @lru: index of lru list the page is sitting on 1331 * @lru: index of lru list the page is sitting on
1332 * @nr_pages: positive when adding or negative when removing 1332 * @nr_pages: positive when adding or negative when removing
1333 * 1333 *
1334 * This function must be called when a page is added to or removed from an 1334 * This function must be called when a page is added to or removed from an
1335 * lru list. 1335 * lru list.
1336 */ 1336 */
1337 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1337 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1338 int nr_pages) 1338 int nr_pages)
1339 { 1339 {
1340 struct mem_cgroup_per_zone *mz; 1340 struct mem_cgroup_per_zone *mz;
1341 unsigned long *lru_size; 1341 unsigned long *lru_size;
1342 1342
1343 if (mem_cgroup_disabled()) 1343 if (mem_cgroup_disabled())
1344 return; 1344 return;
1345 1345
1346 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1346 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1347 lru_size = mz->lru_size + lru; 1347 lru_size = mz->lru_size + lru;
1348 *lru_size += nr_pages; 1348 *lru_size += nr_pages;
1349 VM_BUG_ON((long)(*lru_size) < 0); 1349 VM_BUG_ON((long)(*lru_size) < 0);
1350 } 1350 }
1351 1351
1352 /* 1352 /*
1353 * Checks whether the given memcg is the same as root_memcg or lies in 1353 * Checks whether the given memcg is the same as root_memcg or lies in
1354 * root_memcg's hierarchy subtree 1354 * root_memcg's hierarchy subtree
1355 */ 1355 */
1356 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1356 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1357 struct mem_cgroup *memcg) 1357 struct mem_cgroup *memcg)
1358 { 1358 {
1359 if (root_memcg == memcg) 1359 if (root_memcg == memcg)
1360 return true; 1360 return true;
1361 if (!root_memcg->use_hierarchy || !memcg) 1361 if (!root_memcg->use_hierarchy || !memcg)
1362 return false; 1362 return false;
1363 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1363 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1364 } 1364 }
1365 1365
1366 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1366 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1367 struct mem_cgroup *memcg) 1367 struct mem_cgroup *memcg)
1368 { 1368 {
1369 bool ret; 1369 bool ret;
1370 1370
1371 rcu_read_lock(); 1371 rcu_read_lock();
1372 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1372 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1373 rcu_read_unlock(); 1373 rcu_read_unlock();
1374 return ret; 1374 return ret;
1375 } 1375 }
1376 1376
1377 bool task_in_mem_cgroup(struct task_struct *task, 1377 bool task_in_mem_cgroup(struct task_struct *task,
1378 const struct mem_cgroup *memcg) 1378 const struct mem_cgroup *memcg)
1379 { 1379 {
1380 struct mem_cgroup *curr = NULL; 1380 struct mem_cgroup *curr = NULL;
1381 struct task_struct *p; 1381 struct task_struct *p;
1382 bool ret; 1382 bool ret;
1383 1383
1384 p = find_lock_task_mm(task); 1384 p = find_lock_task_mm(task);
1385 if (p) { 1385 if (p) {
1386 curr = get_mem_cgroup_from_mm(p->mm); 1386 curr = get_mem_cgroup_from_mm(p->mm);
1387 task_unlock(p); 1387 task_unlock(p);
1388 } else { 1388 } else {
1389 /* 1389 /*
1390 * All threads may have already detached their mm's, but the oom 1390 * All threads may have already detached their mm's, but the oom
1391 * killer still needs to detect if they have already been oom 1391 * killer still needs to detect if they have already been oom
1392 * killed to prevent needlessly killing additional tasks. 1392 * killed to prevent needlessly killing additional tasks.
1393 */ 1393 */
1394 rcu_read_lock(); 1394 rcu_read_lock();
1395 curr = mem_cgroup_from_task(task); 1395 curr = mem_cgroup_from_task(task);
1396 if (curr) 1396 if (curr)
1397 css_get(&curr->css); 1397 css_get(&curr->css);
1398 rcu_read_unlock(); 1398 rcu_read_unlock();
1399 } 1399 }
1400 /* 1400 /*
1401 * We should check use_hierarchy of "memcg", not "curr", because checking 1401 * We should check use_hierarchy of "memcg", not "curr", because checking
1402 * use_hierarchy of "curr" here would make this function return true if 1402 * use_hierarchy of "curr" here would make this function return true if
1403 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the 1403 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1404 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). 1404 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1405 */ 1405 */
1406 ret = mem_cgroup_same_or_subtree(memcg, curr); 1406 ret = mem_cgroup_same_or_subtree(memcg, curr);
1407 css_put(&curr->css); 1407 css_put(&curr->css);
1408 return ret; 1408 return ret;
1409 } 1409 }
1410 1410
1411 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1411 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1412 { 1412 {
1413 unsigned long inactive_ratio; 1413 unsigned long inactive_ratio;
1414 unsigned long inactive; 1414 unsigned long inactive;
1415 unsigned long active; 1415 unsigned long active;
1416 unsigned long gb; 1416 unsigned long gb;
1417 1417
1418 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1418 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1419 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1419 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1420 1420
1421 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1421 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1422 if (gb) 1422 if (gb)
1423 inactive_ratio = int_sqrt(10 * gb); 1423 inactive_ratio = int_sqrt(10 * gb);
1424 else 1424 else
1425 inactive_ratio = 1; 1425 inactive_ratio = 1;
1426 1426
1427 return inactive * inactive_ratio < active; 1427 return inactive * inactive_ratio < active;
1428 } 1428 }
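To make the ratio above concrete (assuming 4 KiB pages): with roughly 10 GB of anonymous memory on this lruvec, gb is 10 and inactive_ratio becomes int_sqrt(100) = 10, so the inactive list is only reported as low when it is smaller than a tenth of the active list; below 1 GB, gb is 0, the ratio falls back to 1, and the two list sizes are simply compared directly.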
1429 1429
1430 #define mem_cgroup_from_counter(counter, member) \ 1430 #define mem_cgroup_from_counter(counter, member) \
1431 container_of(counter, struct mem_cgroup, member) 1431 container_of(counter, struct mem_cgroup, member)
1432 1432
1433 /** 1433 /**
1434 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1434 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1435 * @memcg: the memory cgroup 1435 * @memcg: the memory cgroup
1436 * 1436 *
1437 * Returns the maximum amount of memory @mem can be charged with, in 1437 * Returns the maximum amount of memory @mem can be charged with, in
1438 * pages. 1438 * pages.
1439 */ 1439 */
1440 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1440 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1441 { 1441 {
1442 unsigned long margin = 0; 1442 unsigned long margin = 0;
1443 unsigned long count; 1443 unsigned long count;
1444 unsigned long limit; 1444 unsigned long limit;
1445 1445
1446 count = page_counter_read(&memcg->memory); 1446 count = page_counter_read(&memcg->memory);
1447 limit = ACCESS_ONCE(memcg->memory.limit); 1447 limit = ACCESS_ONCE(memcg->memory.limit);
1448 if (count < limit) 1448 if (count < limit)
1449 margin = limit - count; 1449 margin = limit - count;
1450 1450
1451 if (do_swap_account) { 1451 if (do_swap_account) {
1452 count = page_counter_read(&memcg->memsw); 1452 count = page_counter_read(&memcg->memsw);
1453 limit = ACCESS_ONCE(memcg->memsw.limit); 1453 limit = ACCESS_ONCE(memcg->memsw.limit);
1454 if (count <= limit) 1454 if (count <= limit)
1455 margin = min(margin, limit - count); 1455 margin = min(margin, limit - count);
1456 } 1456 }
1457 1457
1458 return margin; 1458 return margin;
1459 } 1459 }
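
Since the margin is simply the smaller of the two counters' headroom, a quick userspace model shows why the memsw counter can clamp the result (hypothetical counter values, in pages; this is not the kernel's page_counter API):

#include <stdio.h>

/*
 * Userspace model of mem_cgroup_margin(): the chargeable headroom is the
 * smaller of the memory and memory+swap headrooms. The counter values
 * below are hypothetical, in pages.
 */
struct counter {
	unsigned long count;
	unsigned long limit;
};

static unsigned long headroom(const struct counter *c)
{
	return c->count < c->limit ? c->limit - c->count : 0;
}

int main(void)
{
	struct counter memory = { .count = 24000, .limit = 25600 };
	struct counter memsw = { .count = 30000, .limit = 30720 };
	unsigned long margin = headroom(&memory);
	unsigned long memsw_margin = headroom(&memsw);

	if (memsw_margin < margin)	/* swap accounting enabled: clamp */
		margin = memsw_margin;
	printf("margin = %lu pages\n", margin);	/* 720, not 1600 */
	return 0;
}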
1460 1460
1461 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1461 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1462 { 1462 {
1463 /* root ? */ 1463 /* root ? */
1464 if (mem_cgroup_disabled() || !memcg->css.parent) 1464 if (mem_cgroup_disabled() || !memcg->css.parent)
1465 return vm_swappiness; 1465 return vm_swappiness;
1466 1466
1467 return memcg->swappiness; 1467 return memcg->swappiness;
1468 } 1468 }
1469 1469
1470 /* 1470 /*
1471 * memcg->moving_account is used for checking possibility that some thread is 1471 * memcg->moving_account is used for checking possibility that some thread is
1472 * calling move_account(). When a thread on CPU-A starts moving pages under 1472 * calling move_account(). When a thread on CPU-A starts moving pages under
1473 * a memcg, other threads should check memcg->moving_account under 1473 * a memcg, other threads should check memcg->moving_account under
1474 * rcu_read_lock(), like this: 1474 * rcu_read_lock(), like this:
1475 * 1475 *
1476 * CPU-A CPU-B 1476 * CPU-A CPU-B
1477 * rcu_read_lock() 1477 * rcu_read_lock()
1478 * memcg->moving_account+1 if (memcg->moving_account) 1478 * memcg->moving_account+1 if (memcg->moving_account)
1479 * take heavy locks. 1479 * take heavy locks.
1480 * synchronize_rcu() update something. 1480 * synchronize_rcu() update something.
1481 * rcu_read_unlock() 1481 * rcu_read_unlock()
1482 * start move here. 1482 * start move here.
1483 */ 1483 */
1484 1484
1485 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1485 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1486 { 1486 {
1487 atomic_inc(&memcg->moving_account); 1487 atomic_inc(&memcg->moving_account);
1488 synchronize_rcu(); 1488 synchronize_rcu();
1489 } 1489 }
1490 1490
1491 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1491 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1492 { 1492 {
1493 /* 1493 /*
1494 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1494 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1495 * We check NULL in callee rather than caller. 1495 * We check NULL in callee rather than caller.
1496 */ 1496 */
1497 if (memcg) 1497 if (memcg)
1498 atomic_dec(&memcg->moving_account); 1498 atomic_dec(&memcg->moving_account);
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * A routine for checking whether "mem" is under move_account() or not. 1502 * A routine for checking whether "mem" is under move_account() or not.
1503 * 1503 *
1504 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of 1504 * Checks whether a cgroup is mc.from or mc.to or under the hierarchy of
1505 * the moving cgroups. This is for waiting at high memory pressure 1505 * the moving cgroups. This is for waiting at high memory pressure
1506 * caused by "move". 1506 * caused by "move".
1507 */ 1507 */
1508 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1508 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1509 { 1509 {
1510 struct mem_cgroup *from; 1510 struct mem_cgroup *from;
1511 struct mem_cgroup *to; 1511 struct mem_cgroup *to;
1512 bool ret = false; 1512 bool ret = false;
1513 /* 1513 /*
1514 * Unlike task_move routines, we access mc.to, mc.from not under 1514 * Unlike task_move routines, we access mc.to, mc.from not under
1515 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1515 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1516 */ 1516 */
1517 spin_lock(&mc.lock); 1517 spin_lock(&mc.lock);
1518 from = mc.from; 1518 from = mc.from;
1519 to = mc.to; 1519 to = mc.to;
1520 if (!from) 1520 if (!from)
1521 goto unlock; 1521 goto unlock;
1522 1522
1523 ret = mem_cgroup_same_or_subtree(memcg, from) 1523 ret = mem_cgroup_same_or_subtree(memcg, from)
1524 || mem_cgroup_same_or_subtree(memcg, to); 1524 || mem_cgroup_same_or_subtree(memcg, to);
1525 unlock: 1525 unlock:
1526 spin_unlock(&mc.lock); 1526 spin_unlock(&mc.lock);
1527 return ret; 1527 return ret;
1528 } 1528 }
1529 1529
1530 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1530 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1531 { 1531 {
1532 if (mc.moving_task && current != mc.moving_task) { 1532 if (mc.moving_task && current != mc.moving_task) {
1533 if (mem_cgroup_under_move(memcg)) { 1533 if (mem_cgroup_under_move(memcg)) {
1534 DEFINE_WAIT(wait); 1534 DEFINE_WAIT(wait);
1535 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1535 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1536 /* moving charge context might have finished. */ 1536 /* moving charge context might have finished. */
1537 if (mc.moving_task) 1537 if (mc.moving_task)
1538 schedule(); 1538 schedule();
1539 finish_wait(&mc.waitq, &wait); 1539 finish_wait(&mc.waitq, &wait);
1540 return true; 1540 return true;
1541 } 1541 }
1542 } 1542 }
1543 return false; 1543 return false;
1544 } 1544 }
1545 1545
1546 /* 1546 /*
1547 * Take this lock when 1547 * Take this lock when
1548 * - some code tries to modify a page's memcg while it's USED. 1548 * - some code tries to modify a page's memcg while it's USED.
1549 * - some code tries to modify page state accounting in a memcg. 1549 * - some code tries to modify page state accounting in a memcg.
1550 */ 1550 */
1551 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1551 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1552 unsigned long *flags) 1552 unsigned long *flags)
1553 { 1553 {
1554 spin_lock_irqsave(&memcg->move_lock, *flags); 1554 spin_lock_irqsave(&memcg->move_lock, *flags);
1555 } 1555 }
1556 1556
1557 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1557 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1558 unsigned long *flags) 1558 unsigned long *flags)
1559 { 1559 {
1560 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1560 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1561 } 1561 }
1562 1562
1563 #define K(x) ((x) << (PAGE_SHIFT-10)) 1563 #define K(x) ((x) << (PAGE_SHIFT-10))
1564 /** 1564 /**
1565 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1565 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1566 * @memcg: The memory cgroup that went over limit 1566 * @memcg: The memory cgroup that went over limit
1567 * @p: Task that is going to be killed 1567 * @p: Task that is going to be killed
1568 * 1568 *
1569 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1569 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1570 * enabled 1570 * enabled
1571 */ 1571 */
1572 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1572 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1573 { 1573 {
1574 /* oom_info_lock ensures that parallel ooms do not interleave */ 1574 /* oom_info_lock ensures that parallel ooms do not interleave */
1575 static DEFINE_MUTEX(oom_info_lock); 1575 static DEFINE_MUTEX(oom_info_lock);
1576 struct mem_cgroup *iter; 1576 struct mem_cgroup *iter;
1577 unsigned int i; 1577 unsigned int i;
1578 1578
1579 if (!p) 1579 if (!p)
1580 return; 1580 return;
1581 1581
1582 mutex_lock(&oom_info_lock); 1582 mutex_lock(&oom_info_lock);
1583 rcu_read_lock(); 1583 rcu_read_lock();
1584 1584
1585 pr_info("Task in "); 1585 pr_info("Task in ");
1586 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1586 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1587 pr_info(" killed as a result of limit of "); 1587 pr_info(" killed as a result of limit of ");
1588 pr_cont_cgroup_path(memcg->css.cgroup); 1588 pr_cont_cgroup_path(memcg->css.cgroup);
1589 pr_info("\n"); 1589 pr_info("\n");
1590 1590
1591 rcu_read_unlock(); 1591 rcu_read_unlock();
1592 1592
1593 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", 1593 pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1594 K((u64)page_counter_read(&memcg->memory)), 1594 K((u64)page_counter_read(&memcg->memory)),
1595 K((u64)memcg->memory.limit), memcg->memory.failcnt); 1595 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1596 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", 1596 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1597 K((u64)page_counter_read(&memcg->memsw)), 1597 K((u64)page_counter_read(&memcg->memsw)),
1598 K((u64)memcg->memsw.limit), memcg->memsw.failcnt); 1598 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1599 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", 1599 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1600 K((u64)page_counter_read(&memcg->kmem)), 1600 K((u64)page_counter_read(&memcg->kmem)),
1601 K((u64)memcg->kmem.limit), memcg->kmem.failcnt); 1601 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1602 1602
1603 for_each_mem_cgroup_tree(iter, memcg) { 1603 for_each_mem_cgroup_tree(iter, memcg) {
1604 pr_info("Memory cgroup stats for "); 1604 pr_info("Memory cgroup stats for ");
1605 pr_cont_cgroup_path(iter->css.cgroup); 1605 pr_cont_cgroup_path(iter->css.cgroup);
1606 pr_cont(":"); 1606 pr_cont(":");
1607 1607
1608 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1608 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1609 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1609 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1610 continue; 1610 continue;
1611 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1611 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1612 K(mem_cgroup_read_stat(iter, i))); 1612 K(mem_cgroup_read_stat(iter, i)));
1613 } 1613 }
1614 1614
1615 for (i = 0; i < NR_LRU_LISTS; i++) 1615 for (i = 0; i < NR_LRU_LISTS; i++)
1616 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1616 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1617 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1617 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1618 1618
1619 pr_cont("\n"); 1619 pr_cont("\n");
1620 } 1620 }
1621 mutex_unlock(&oom_info_lock); 1621 mutex_unlock(&oom_info_lock);
1622 } 1622 }
1623 1623
1624 /* 1624 /*
1625 * This function returns the number of memcgs under the hierarchy tree. Returns 1625 * This function returns the number of memcgs under the hierarchy tree. Returns
1626 * 1 (self count) if there are no children. 1626 * 1 (self count) if there are no children.
1627 */ 1627 */
1628 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1628 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1629 { 1629 {
1630 int num = 0; 1630 int num = 0;
1631 struct mem_cgroup *iter; 1631 struct mem_cgroup *iter;
1632 1632
1633 for_each_mem_cgroup_tree(iter, memcg) 1633 for_each_mem_cgroup_tree(iter, memcg)
1634 num++; 1634 num++;
1635 return num; 1635 return num;
1636 } 1636 }
1637 1637
1638 /* 1638 /*
1639 * Return the memory (and swap, if configured) limit for a memcg. 1639 * Return the memory (and swap, if configured) limit for a memcg.
1640 */ 1640 */
1641 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) 1641 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1642 { 1642 {
1643 unsigned long limit; 1643 unsigned long limit;
1644 1644
1645 limit = memcg->memory.limit; 1645 limit = memcg->memory.limit;
1646 if (mem_cgroup_swappiness(memcg)) { 1646 if (mem_cgroup_swappiness(memcg)) {
1647 unsigned long memsw_limit; 1647 unsigned long memsw_limit;
1648 1648
1649 memsw_limit = memcg->memsw.limit; 1649 memsw_limit = memcg->memsw.limit;
1650 limit = min(limit + total_swap_pages, memsw_limit); 1650 limit = min(limit + total_swap_pages, memsw_limit);
1651 } 1651 }
1652 return limit; 1652 return limit;
1653 } 1653 }
1654 1654
1655 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1655 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1656 int order) 1656 int order)
1657 { 1657 {
1658 struct mem_cgroup *iter; 1658 struct mem_cgroup *iter;
1659 unsigned long chosen_points = 0; 1659 unsigned long chosen_points = 0;
1660 unsigned long totalpages; 1660 unsigned long totalpages;
1661 unsigned int points = 0; 1661 unsigned int points = 0;
1662 struct task_struct *chosen = NULL; 1662 struct task_struct *chosen = NULL;
1663 1663
1664 /* 1664 /*
1665 * If current has a pending SIGKILL or is exiting, then automatically 1665 * If current has a pending SIGKILL or is exiting, then automatically
1666 * select it. The goal is to allow it to allocate so that it may 1666 * select it. The goal is to allow it to allocate so that it may
1667 * quickly exit and free its memory. 1667 * quickly exit and free its memory.
1668 */ 1668 */
1669 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1669 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1670 set_thread_flag(TIF_MEMDIE); 1670 set_thread_flag(TIF_MEMDIE);
1671 return; 1671 return;
1672 } 1672 }
1673 1673
1674 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1674 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1675 totalpages = mem_cgroup_get_limit(memcg) ? : 1; 1675 totalpages = mem_cgroup_get_limit(memcg) ? : 1;
1676 for_each_mem_cgroup_tree(iter, memcg) { 1676 for_each_mem_cgroup_tree(iter, memcg) {
1677 struct css_task_iter it; 1677 struct css_task_iter it;
1678 struct task_struct *task; 1678 struct task_struct *task;
1679 1679
1680 css_task_iter_start(&iter->css, &it); 1680 css_task_iter_start(&iter->css, &it);
1681 while ((task = css_task_iter_next(&it))) { 1681 while ((task = css_task_iter_next(&it))) {
1682 switch (oom_scan_process_thread(task, totalpages, NULL, 1682 switch (oom_scan_process_thread(task, totalpages, NULL,
1683 false)) { 1683 false)) {
1684 case OOM_SCAN_SELECT: 1684 case OOM_SCAN_SELECT:
1685 if (chosen) 1685 if (chosen)
1686 put_task_struct(chosen); 1686 put_task_struct(chosen);
1687 chosen = task; 1687 chosen = task;
1688 chosen_points = ULONG_MAX; 1688 chosen_points = ULONG_MAX;
1689 get_task_struct(chosen); 1689 get_task_struct(chosen);
1690 /* fall through */ 1690 /* fall through */
1691 case OOM_SCAN_CONTINUE: 1691 case OOM_SCAN_CONTINUE:
1692 continue; 1692 continue;
1693 case OOM_SCAN_ABORT: 1693 case OOM_SCAN_ABORT:
1694 css_task_iter_end(&it); 1694 css_task_iter_end(&it);
1695 mem_cgroup_iter_break(memcg, iter); 1695 mem_cgroup_iter_break(memcg, iter);
1696 if (chosen) 1696 if (chosen)
1697 put_task_struct(chosen); 1697 put_task_struct(chosen);
1698 return; 1698 return;
1699 case OOM_SCAN_OK: 1699 case OOM_SCAN_OK:
1700 break; 1700 break;
1701 }; 1701 };
1702 points = oom_badness(task, memcg, NULL, totalpages); 1702 points = oom_badness(task, memcg, NULL, totalpages);
1703 if (!points || points < chosen_points) 1703 if (!points || points < chosen_points)
1704 continue; 1704 continue;
1705 /* Prefer thread group leaders for display purposes */ 1705 /* Prefer thread group leaders for display purposes */
1706 if (points == chosen_points && 1706 if (points == chosen_points &&
1707 thread_group_leader(chosen)) 1707 thread_group_leader(chosen))
1708 continue; 1708 continue;
1709 1709
1710 if (chosen) 1710 if (chosen)
1711 put_task_struct(chosen); 1711 put_task_struct(chosen);
1712 chosen = task; 1712 chosen = task;
1713 chosen_points = points; 1713 chosen_points = points;
1714 get_task_struct(chosen); 1714 get_task_struct(chosen);
1715 } 1715 }
1716 css_task_iter_end(&it); 1716 css_task_iter_end(&it);
1717 } 1717 }
1718 1718
1719 if (!chosen) 1719 if (!chosen)
1720 return; 1720 return;
1721 points = chosen_points * 1000 / totalpages; 1721 points = chosen_points * 1000 / totalpages;
1722 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1722 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1723 NULL, "Memory cgroup out of memory"); 1723 NULL, "Memory cgroup out of memory");
1724 } 1724 }
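
The task scan above boils down to keeping the highest oom_badness() score seen so far, skipping zero or lower scores, and on an exact tie keeping the task that was chosen earlier. A userspace model with made-up scores (it leaves out the OOM_SCAN_* special cases, refcounting, and the thread-group-leader tie-break detail):

#include <stdio.h>

/*
 * Userspace model of the victim scan above: keep the task with the
 * highest badness score, skip zero or lower scores, and on an exact tie
 * keep the task chosen earlier. The scores are made up.
 */
int main(void)
{
	unsigned int points[] = { 120, 0, 480, 480, 33 };
	unsigned long chosen_points = 0;
	int chosen = -1;

	for (int i = 0; i < 5; i++) {
		if (!points[i] || points[i] < chosen_points)
			continue;
		if (points[i] == chosen_points && chosen >= 0)
			continue;	/* tie: keep the current choice */
		chosen = i;
		chosen_points = points[i];
	}
	printf("chosen task %d with %lu points\n", chosen, chosen_points);
	return 0;
}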
1725 1725
1726 /** 1726 /**
1727 * test_mem_cgroup_node_reclaimable 1727 * test_mem_cgroup_node_reclaimable
1728 * @memcg: the target memcg 1728 * @memcg: the target memcg
1729 * @nid: the node ID to be checked. 1729 * @nid: the node ID to be checked.
1730 * @noswap : specify true here if the user wants file only information. 1730 * @noswap : specify true here if the user wants file only information.
1731 * 1731 *
1732 * This function returns whether the specified memcg contains any 1732 * This function returns whether the specified memcg contains any
1733 * reclaimable pages on a node. Returns true if there are any reclaimable 1733 * reclaimable pages on a node. Returns true if there are any reclaimable
1734 * pages in the node. 1734 * pages in the node.
1735 */ 1735 */
1736 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1736 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1737 int nid, bool noswap) 1737 int nid, bool noswap)
1738 { 1738 {
1739 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1739 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1740 return true; 1740 return true;
1741 if (noswap || !total_swap_pages) 1741 if (noswap || !total_swap_pages)
1742 return false; 1742 return false;
1743 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1743 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1744 return true; 1744 return true;
1745 return false; 1745 return false;
1746 1746
1747 } 1747 }
1748 #if MAX_NUMNODES > 1 1748 #if MAX_NUMNODES > 1
1749 1749
1750 /* 1750 /*
1751 * Always updating the nodemask is not very good - even if we have an empty 1751 * Always updating the nodemask is not very good - even if we have an empty
1752 * list or the wrong list here, we can start from some node and traverse all 1752 * list or the wrong list here, we can start from some node and traverse all
1753 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1753 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1754 * 1754 *
1755 */ 1755 */
1756 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1756 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1757 { 1757 {
1758 int nid; 1758 int nid;
1759 /* 1759 /*
1760 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1760 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1761 * pagein/pageout changes since the last update. 1761 * pagein/pageout changes since the last update.
1762 */ 1762 */
1763 if (!atomic_read(&memcg->numainfo_events)) 1763 if (!atomic_read(&memcg->numainfo_events))
1764 return; 1764 return;
1765 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1765 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1766 return; 1766 return;
1767 1767
1768 /* make a nodemask where this memcg uses memory from */ 1768 /* make a nodemask where this memcg uses memory from */
1769 memcg->scan_nodes = node_states[N_MEMORY]; 1769 memcg->scan_nodes = node_states[N_MEMORY];
1770 1770
1771 for_each_node_mask(nid, node_states[N_MEMORY]) { 1771 for_each_node_mask(nid, node_states[N_MEMORY]) {
1772 1772
1773 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1773 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1774 node_clear(nid, memcg->scan_nodes); 1774 node_clear(nid, memcg->scan_nodes);
1775 } 1775 }
1776 1776
1777 atomic_set(&memcg->numainfo_events, 0); 1777 atomic_set(&memcg->numainfo_events, 0);
1778 atomic_set(&memcg->numainfo_updating, 0); 1778 atomic_set(&memcg->numainfo_updating, 0);
1779 } 1779 }
1780 1780
1781 /* 1781 /*
1782 * Select a node to start reclaim from. Because what we need is just 1782 * Select a node to start reclaim from. Because what we need is just
1783 * reducing the usage counter, starting from anywhere is OK. Considering 1783 * reducing the usage counter, starting from anywhere is OK. Considering
1784 * memory reclaim from the current node, there are pros and cons. 1784 * memory reclaim from the current node, there are pros and cons.
1785 * 1785 *
1786 * Freeing memory from current node means freeing memory from a node which 1786 * Freeing memory from current node means freeing memory from a node which
1787 * we'll use or we've used. So, it may make the LRU bad. And if several threads 1787 * we'll use or we've used. So, it may make the LRU bad. And if several threads
1788 * hit their limits, they will contend on a node. But freeing from a remote 1788 * hit their limits, they will contend on a node. But freeing from a remote
1789 * node means more costs for memory reclaim because of memory latency. 1789 * node means more costs for memory reclaim because of memory latency.
1790 * 1790 *
1791 * Now, we use round-robin. Better algorithm is welcomed. 1791 * Now, we use round-robin. Better algorithm is welcomed.
1792 */ 1792 */
1793 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1793 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1794 { 1794 {
1795 int node; 1795 int node;
1796 1796
1797 mem_cgroup_may_update_nodemask(memcg); 1797 mem_cgroup_may_update_nodemask(memcg);
1798 node = memcg->last_scanned_node; 1798 node = memcg->last_scanned_node;
1799 1799
1800 node = next_node(node, memcg->scan_nodes); 1800 node = next_node(node, memcg->scan_nodes);
1801 if (node == MAX_NUMNODES) 1801 if (node == MAX_NUMNODES)
1802 node = first_node(memcg->scan_nodes); 1802 node = first_node(memcg->scan_nodes);
1803 /* 1803 /*
1804 * We call this when we hit limit, not when pages are added to LRU. 1804 * We call this when we hit limit, not when pages are added to LRU.
1805 * No LRU may hold pages because all pages are UNEVICTABLE or 1805 * No LRU may hold pages because all pages are UNEVICTABLE or
1806 * memcg is too small and all pages are not on LRU. In that case, 1806 * memcg is too small and all pages are not on LRU. In that case,
1807 * we use the current node. 1807 * we use the current node.
1808 */ 1808 */
1809 if (unlikely(node == MAX_NUMNODES)) 1809 if (unlikely(node == MAX_NUMNODES))
1810 node = numa_node_id(); 1810 node = numa_node_id();
1811 1811
1812 memcg->last_scanned_node = node; 1812 memcg->last_scanned_node = node;
1813 return node; 1813 return node;
1814 } 1814 }
1815 1815
1816 /* 1816 /*
1817 * Check all nodes whether they contain reclaimable pages or not. 1817 * Check all nodes whether they contain reclaimable pages or not.
1818 * For quick scan, we make use of scan_nodes. This will allow us to skip 1818 * For quick scan, we make use of scan_nodes. This will allow us to skip
1819 * unused nodes. But scan_nodes is lazily updated and may not contain 1819 * unused nodes. But scan_nodes is lazily updated and may not contain
1820 * enough new information. We need to do double check. 1820 * enough new information. We need to do double check.
1821 */ 1821 */
1822 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1822 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1823 { 1823 {
1824 int nid; 1824 int nid;
1825 1825
1826 /* 1826 /*
1827 * quick check...making use of scan_node. 1827 * quick check...making use of scan_node.
1828 * We can skip unused nodes. 1828 * We can skip unused nodes.
1829 */ 1829 */
1830 if (!nodes_empty(memcg->scan_nodes)) { 1830 if (!nodes_empty(memcg->scan_nodes)) {
1831 for (nid = first_node(memcg->scan_nodes); 1831 for (nid = first_node(memcg->scan_nodes);
1832 nid < MAX_NUMNODES; 1832 nid < MAX_NUMNODES;
1833 nid = next_node(nid, memcg->scan_nodes)) { 1833 nid = next_node(nid, memcg->scan_nodes)) {
1834 1834
1835 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1835 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1836 return true; 1836 return true;
1837 } 1837 }
1838 } 1838 }
1839 /* 1839 /*
1840 * Check rest of nodes. 1840 * Check rest of nodes.
1841 */ 1841 */
1842 for_each_node_state(nid, N_MEMORY) { 1842 for_each_node_state(nid, N_MEMORY) {
1843 if (node_isset(nid, memcg->scan_nodes)) 1843 if (node_isset(nid, memcg->scan_nodes))
1844 continue; 1844 continue;
1845 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1845 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1846 return true; 1846 return true;
1847 } 1847 }
1848 return false; 1848 return false;
1849 } 1849 }
1850 1850
1851 #else 1851 #else
1852 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1852 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1853 { 1853 {
1854 return 0; 1854 return 0;
1855 } 1855 }
1856 1856
1857 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1857 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1858 { 1858 {
1859 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1859 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1860 } 1860 }
1861 #endif 1861 #endif
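
The victim-node choice in mem_cgroup_select_victim_node() is a plain round-robin walk over the scan_nodes mask with a fallback to the local node. A self-contained model behaves the same way (an 8-node toy mask; the helpers only mimic the kernel's nodemask API):

#include <stdio.h>

/*
 * Userspace model of the round-robin victim-node choice above: take the
 * next set bit in a small scan_nodes mask after the last node used,
 * wrapping around, and fall back to the local node when the mask is
 * empty.
 */
#define MAX_NODES 8

static int next_set(int prev, unsigned int mask)
{
	for (int nid = prev + 1; nid < MAX_NODES; nid++)
		if (mask & (1u << nid))
			return nid;
	return MAX_NODES;	/* no more set bits */
}

static int select_victim_node(int *last, unsigned int scan_nodes, int this_node)
{
	int node = next_set(*last, scan_nodes);

	if (node == MAX_NODES)
		node = next_set(-1, scan_nodes);	/* wrap to the first node */
	if (node == MAX_NODES)
		node = this_node;	/* empty mask: reclaim where we run */
	*last = node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = 0x05;	/* nodes 0 and 2 hold reclaimable pages */
	int last = -1;

	for (int i = 0; i < 4; i++)
		printf("%d ", select_victim_node(&last, scan_nodes, 0));
	printf("\n");	/* prints: 0 2 0 2 */
	return 0;
}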
1862 1862
1863 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 1863 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1864 struct zone *zone, 1864 struct zone *zone,
1865 gfp_t gfp_mask, 1865 gfp_t gfp_mask,
1866 unsigned long *total_scanned) 1866 unsigned long *total_scanned)
1867 { 1867 {
1868 struct mem_cgroup *victim = NULL; 1868 struct mem_cgroup *victim = NULL;
1869 int total = 0; 1869 int total = 0;
1870 int loop = 0; 1870 int loop = 0;
1871 unsigned long excess; 1871 unsigned long excess;
1872 unsigned long nr_scanned; 1872 unsigned long nr_scanned;
1873 struct mem_cgroup_reclaim_cookie reclaim = { 1873 struct mem_cgroup_reclaim_cookie reclaim = {
1874 .zone = zone, 1874 .zone = zone,
1875 .priority = 0, 1875 .priority = 0,
1876 }; 1876 };
1877 1877
1878 excess = soft_limit_excess(root_memcg); 1878 excess = soft_limit_excess(root_memcg);
1879 1879
1880 while (1) { 1880 while (1) {
1881 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 1881 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1882 if (!victim) { 1882 if (!victim) {
1883 loop++; 1883 loop++;
1884 if (loop >= 2) { 1884 if (loop >= 2) {
1885 /* 1885 /*
1886 * If we have not been able to reclaim 1886 * If we have not been able to reclaim
1887 * anything, it might be because there are 1887 * anything, it might be because there are
1888 * no reclaimable pages under this hierarchy 1888 * no reclaimable pages under this hierarchy
1889 */ 1889 */
1890 if (!total) 1890 if (!total)
1891 break; 1891 break;
1892 /* 1892 /*
1893 * We want to do more targeted reclaim. 1893 * We want to do more targeted reclaim.
1894 * excess >> 2 is not too excessive, so as not to 1894 * excess >> 2 is not too excessive, so as not to
1895 * reclaim too much, nor too little, which would keep us 1895 * reclaim too much, nor too little, which would keep us
1896 * coming back to reclaim from this cgroup 1896 * coming back to reclaim from this cgroup
1897 */ 1897 */
1898 if (total >= (excess >> 2) || 1898 if (total >= (excess >> 2) ||
1899 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 1899 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1900 break; 1900 break;
1901 } 1901 }
1902 continue; 1902 continue;
1903 } 1903 }
1904 if (!mem_cgroup_reclaimable(victim, false)) 1904 if (!mem_cgroup_reclaimable(victim, false))
1905 continue; 1905 continue;
1906 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 1906 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
1907 zone, &nr_scanned); 1907 zone, &nr_scanned);
1908 *total_scanned += nr_scanned; 1908 *total_scanned += nr_scanned;
1909 if (!soft_limit_excess(root_memcg)) 1909 if (!soft_limit_excess(root_memcg))
1910 break; 1910 break;
1911 } 1911 }
1912 mem_cgroup_iter_break(root_memcg, victim); 1912 mem_cgroup_iter_break(root_memcg, victim);
1913 return total; 1913 return total;
1914 } 1914 }
1915 1915
1916 #ifdef CONFIG_LOCKDEP 1916 #ifdef CONFIG_LOCKDEP
1917 static struct lockdep_map memcg_oom_lock_dep_map = { 1917 static struct lockdep_map memcg_oom_lock_dep_map = {
1918 .name = "memcg_oom_lock", 1918 .name = "memcg_oom_lock",
1919 }; 1919 };
1920 #endif 1920 #endif
1921 1921
1922 static DEFINE_SPINLOCK(memcg_oom_lock); 1922 static DEFINE_SPINLOCK(memcg_oom_lock);
1923 1923
1924 /* 1924 /*
1925 * Check whether the OOM killer is already running under our hierarchy. 1925 * Check whether the OOM killer is already running under our hierarchy.
1926 * If someone is already running it, return false. 1926 * If someone is already running it, return false.
1927 */ 1927 */
1928 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 1928 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1929 { 1929 {
1930 struct mem_cgroup *iter, *failed = NULL; 1930 struct mem_cgroup *iter, *failed = NULL;
1931 1931
1932 spin_lock(&memcg_oom_lock); 1932 spin_lock(&memcg_oom_lock);
1933 1933
1934 for_each_mem_cgroup_tree(iter, memcg) { 1934 for_each_mem_cgroup_tree(iter, memcg) {
1935 if (iter->oom_lock) { 1935 if (iter->oom_lock) {
1936 /* 1936 /*
1937 * this subtree of our hierarchy is already locked 1937 * this subtree of our hierarchy is already locked
1938 * so we cannot give a lock. 1938 * so we cannot give a lock.
1939 */ 1939 */
1940 failed = iter; 1940 failed = iter;
1941 mem_cgroup_iter_break(memcg, iter); 1941 mem_cgroup_iter_break(memcg, iter);
1942 break; 1942 break;
1943 } else 1943 } else
1944 iter->oom_lock = true; 1944 iter->oom_lock = true;
1945 } 1945 }
1946 1946
1947 if (failed) { 1947 if (failed) {
1948 /* 1948 /*
1949 * OK, we failed to lock the whole subtree so we have 1949 * OK, we failed to lock the whole subtree so we have
1950 * to clean up what we set up to the failing subtree 1950 * to clean up what we set up to the failing subtree
1951 */ 1951 */
1952 for_each_mem_cgroup_tree(iter, memcg) { 1952 for_each_mem_cgroup_tree(iter, memcg) {
1953 if (iter == failed) { 1953 if (iter == failed) {
1954 mem_cgroup_iter_break(memcg, iter); 1954 mem_cgroup_iter_break(memcg, iter);
1955 break; 1955 break;
1956 } 1956 }
1957 iter->oom_lock = false; 1957 iter->oom_lock = false;
1958 } 1958 }
1959 } else 1959 } else
1960 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 1960 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1961 1961
1962 spin_unlock(&memcg_oom_lock); 1962 spin_unlock(&memcg_oom_lock);
1963 1963
1964 return !failed; 1964 return !failed;
1965 } 1965 }
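
mem_cgroup_oom_trylock() is a classic try-everything-then-unwind pattern: the hierarchy walk stops at the first already-locked group and rolls back only the locks taken in this pass. A sketch over a flat array standing in for the subtree walk (assumption: array order models the iteration order, and no real locking is involved):

#include <stdbool.h>
#include <stdio.h>

/*
 * Userspace model of the hierarchical OOM trylock above: try to mark
 * every group in the subtree; if one is already locked, unwind only the
 * marks taken in this pass and report failure.
 */
static bool oom_trylock(bool *locked, int n)
{
	int failed = -1;

	for (int i = 0; i < n; i++) {
		if (locked[i]) {	/* part of the subtree is already locked */
			failed = i;
			break;
		}
		locked[i] = true;
	}
	if (failed >= 0)
		for (int i = 0; i < failed; i++)	/* roll back our marks */
			locked[i] = false;
	return failed < 0;
}

int main(void)
{
	bool subtree[4] = { false, false, true, false };

	printf("locked=%d\n", oom_trylock(subtree, 4));	/* 0: group 2 was busy */
	printf("state: %d %d %d %d\n",
	       subtree[0], subtree[1], subtree[2], subtree[3]);
	return 0;
}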
1966 1966
1967 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 1967 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1968 { 1968 {
1969 struct mem_cgroup *iter; 1969 struct mem_cgroup *iter;
1970 1970
1971 spin_lock(&memcg_oom_lock); 1971 spin_lock(&memcg_oom_lock);
1972 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 1972 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1973 for_each_mem_cgroup_tree(iter, memcg) 1973 for_each_mem_cgroup_tree(iter, memcg)
1974 iter->oom_lock = false; 1974 iter->oom_lock = false;
1975 spin_unlock(&memcg_oom_lock); 1975 spin_unlock(&memcg_oom_lock);
1976 } 1976 }
1977 1977
1978 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 1978 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1979 { 1979 {
1980 struct mem_cgroup *iter; 1980 struct mem_cgroup *iter;
1981 1981
1982 for_each_mem_cgroup_tree(iter, memcg) 1982 for_each_mem_cgroup_tree(iter, memcg)
1983 atomic_inc(&iter->under_oom); 1983 atomic_inc(&iter->under_oom);
1984 } 1984 }
1985 1985
1986 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 1986 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1987 { 1987 {
1988 struct mem_cgroup *iter; 1988 struct mem_cgroup *iter;
1989 1989
1990 /* 1990 /*
1991 * When a new child is created while the hierarchy is under oom, 1991 * When a new child is created while the hierarchy is under oom,
1992 * mem_cgroup_oom_lock() may not be called. We have to use 1992 * mem_cgroup_oom_lock() may not be called. We have to use
1993 * atomic_add_unless() here. 1993 * atomic_add_unless() here.
1994 */ 1994 */
1995 for_each_mem_cgroup_tree(iter, memcg) 1995 for_each_mem_cgroup_tree(iter, memcg)
1996 atomic_add_unless(&iter->under_oom, -1, 0); 1996 atomic_add_unless(&iter->under_oom, -1, 0);
1997 } 1997 }
1998 1998
1999 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1999 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2000 2000
2001 struct oom_wait_info { 2001 struct oom_wait_info {
2002 struct mem_cgroup *memcg; 2002 struct mem_cgroup *memcg;
2003 wait_queue_t wait; 2003 wait_queue_t wait;
2004 }; 2004 };
2005 2005
2006 static int memcg_oom_wake_function(wait_queue_t *wait, 2006 static int memcg_oom_wake_function(wait_queue_t *wait,
2007 unsigned mode, int sync, void *arg) 2007 unsigned mode, int sync, void *arg)
2008 { 2008 {
2009 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2009 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2010 struct mem_cgroup *oom_wait_memcg; 2010 struct mem_cgroup *oom_wait_memcg;
2011 struct oom_wait_info *oom_wait_info; 2011 struct oom_wait_info *oom_wait_info;
2012 2012
2013 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2013 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2014 oom_wait_memcg = oom_wait_info->memcg; 2014 oom_wait_memcg = oom_wait_info->memcg;
2015 2015
2016 /* 2016 /*
2017 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2017 * Both of oom_wait_info->memcg and wake_memcg are stable under us.
2018 * Then we can use css_is_ancestor without taking care of RCU. 2018 * Then we can use css_is_ancestor without taking care of RCU.
2019 */ 2019 */
2020 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2020 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2021 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2021 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2022 return 0; 2022 return 0;
2023 return autoremove_wake_function(wait, mode, sync, arg); 2023 return autoremove_wake_function(wait, mode, sync, arg);
2024 } 2024 }
2025 2025
2026 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2026 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2027 { 2027 {
2028 atomic_inc(&memcg->oom_wakeups); 2028 atomic_inc(&memcg->oom_wakeups);
2029 /* for filtering, pass "memcg" as argument. */ 2029 /* for filtering, pass "memcg" as argument. */
2030 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2030 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2031 } 2031 }
2032 2032
2033 static void memcg_oom_recover(struct mem_cgroup *memcg) 2033 static void memcg_oom_recover(struct mem_cgroup *memcg)
2034 { 2034 {
2035 if (memcg && atomic_read(&memcg->under_oom)) 2035 if (memcg && atomic_read(&memcg->under_oom))
2036 memcg_wakeup_oom(memcg); 2036 memcg_wakeup_oom(memcg);
2037 } 2037 }
2038 2038
2039 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2039 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2040 { 2040 {
2041 if (!current->memcg_oom.may_oom) 2041 if (!current->memcg_oom.may_oom)
2042 return; 2042 return;
2043 /* 2043 /*
2044 * We are in the middle of the charge context here, so we 2044 * We are in the middle of the charge context here, so we
2045 * don't want to block when potentially sitting on a callstack 2045 * don't want to block when potentially sitting on a callstack
2046 * that holds all kinds of filesystem and mm locks. 2046 * that holds all kinds of filesystem and mm locks.
2047 * 2047 *
2048 * Also, the caller may handle a failed allocation gracefully 2048 * Also, the caller may handle a failed allocation gracefully
2049 * (like optional page cache readahead) and so an OOM killer 2049 * (like optional page cache readahead) and so an OOM killer
2050 * invocation might not even be necessary. 2050 * invocation might not even be necessary.
2051 * 2051 *
2052 * That's why we don't do anything here except remember the 2052 * That's why we don't do anything here except remember the
2053 * OOM context and then deal with it at the end of the page 2053 * OOM context and then deal with it at the end of the page
2054 * fault when the stack is unwound, the locks are released, 2054 * fault when the stack is unwound, the locks are released,
2055 * and when we know whether the fault was overall successful. 2055 * and when we know whether the fault was overall successful.
2056 */ 2056 */
2057 css_get(&memcg->css); 2057 css_get(&memcg->css);
2058 current->memcg_oom.memcg = memcg; 2058 current->memcg_oom.memcg = memcg;
2059 current->memcg_oom.gfp_mask = mask; 2059 current->memcg_oom.gfp_mask = mask;
2060 current->memcg_oom.order = order; 2060 current->memcg_oom.order = order;
2061 } 2061 }
2062 2062
2063 /** 2063 /**
2064 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2064 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2065 * @handle: actually kill/wait or just clean up the OOM state 2065 * @handle: actually kill/wait or just clean up the OOM state
2066 * 2066 *
2067 * This has to be called at the end of a page fault if the memcg OOM 2067 * This has to be called at the end of a page fault if the memcg OOM
2068 * handler was enabled. 2068 * handler was enabled.
2069 * 2069 *
2070 * Memcg supports userspace OOM handling where failed allocations must 2070 * Memcg supports userspace OOM handling where failed allocations must
2071 * sleep on a waitqueue until the userspace task resolves the 2071 * sleep on a waitqueue until the userspace task resolves the
2072 * situation. Sleeping directly in the charge context with all kinds 2072 * situation. Sleeping directly in the charge context with all kinds
2073 * of locks held is not a good idea, instead we remember an OOM state 2073 * of locks held is not a good idea, instead we remember an OOM state
2074 * in the task and mem_cgroup_oom_synchronize() has to be called at 2074 * in the task and mem_cgroup_oom_synchronize() has to be called at
2075 * the end of the page fault to complete the OOM handling. 2075 * the end of the page fault to complete the OOM handling.
2076 * 2076 *
2077 * Returns %true if an ongoing memcg OOM situation was detected and 2077 * Returns %true if an ongoing memcg OOM situation was detected and
2078 * completed, %false otherwise. 2078 * completed, %false otherwise.
2079 */ 2079 */
2080 bool mem_cgroup_oom_synchronize(bool handle) 2080 bool mem_cgroup_oom_synchronize(bool handle)
2081 { 2081 {
2082 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2082 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2083 struct oom_wait_info owait; 2083 struct oom_wait_info owait;
2084 bool locked; 2084 bool locked;
2085 2085
2086 /* OOM is global, do not handle */ 2086 /* OOM is global, do not handle */
2087 if (!memcg) 2087 if (!memcg)
2088 return false; 2088 return false;
2089 2089
2090 if (!handle) 2090 if (!handle)
2091 goto cleanup; 2091 goto cleanup;
2092 2092
2093 owait.memcg = memcg; 2093 owait.memcg = memcg;
2094 owait.wait.flags = 0; 2094 owait.wait.flags = 0;
2095 owait.wait.func = memcg_oom_wake_function; 2095 owait.wait.func = memcg_oom_wake_function;
2096 owait.wait.private = current; 2096 owait.wait.private = current;
2097 INIT_LIST_HEAD(&owait.wait.task_list); 2097 INIT_LIST_HEAD(&owait.wait.task_list);
2098 2098
2099 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2099 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2100 mem_cgroup_mark_under_oom(memcg); 2100 mem_cgroup_mark_under_oom(memcg);
2101 2101
2102 locked = mem_cgroup_oom_trylock(memcg); 2102 locked = mem_cgroup_oom_trylock(memcg);
2103 2103
2104 if (locked) 2104 if (locked)
2105 mem_cgroup_oom_notify(memcg); 2105 mem_cgroup_oom_notify(memcg);
2106 2106
2107 if (locked && !memcg->oom_kill_disable) { 2107 if (locked && !memcg->oom_kill_disable) {
2108 mem_cgroup_unmark_under_oom(memcg); 2108 mem_cgroup_unmark_under_oom(memcg);
2109 finish_wait(&memcg_oom_waitq, &owait.wait); 2109 finish_wait(&memcg_oom_waitq, &owait.wait);
2110 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2110 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2111 current->memcg_oom.order); 2111 current->memcg_oom.order);
2112 } else { 2112 } else {
2113 schedule(); 2113 schedule();
2114 mem_cgroup_unmark_under_oom(memcg); 2114 mem_cgroup_unmark_under_oom(memcg);
2115 finish_wait(&memcg_oom_waitq, &owait.wait); 2115 finish_wait(&memcg_oom_waitq, &owait.wait);
2116 } 2116 }
2117 2117
2118 if (locked) { 2118 if (locked) {
2119 mem_cgroup_oom_unlock(memcg); 2119 mem_cgroup_oom_unlock(memcg);
2120 /* 2120 /*
2121 * There is no guarantee that an OOM-lock contender 2121 * There is no guarantee that an OOM-lock contender
2122 * sees the wakeups triggered by the OOM kill 2122 * sees the wakeups triggered by the OOM kill
2123 * uncharges. Wake any sleepers explicitly. 2123 * uncharges. Wake any sleepers explicitly.
2124 */ 2124 */
2125 memcg_oom_recover(memcg); 2125 memcg_oom_recover(memcg);
2126 } 2126 }
2127 cleanup: 2127 cleanup:
2128 current->memcg_oom.memcg = NULL; 2128 current->memcg_oom.memcg = NULL;
2129 css_put(&memcg->css); 2129 css_put(&memcg->css);
2130 return true; 2130 return true;
2131 } 2131 }
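
Per the kerneldoc above, the page-fault exit path is expected to call this once locks are dropped: if the fault failed because of OOM, the memcg OOM handler gets the first chance, and only when no memcg OOM was recorded does the global OOM path run. A simplified consumer sketch (not the verbatim kernel fault path):

/* sketch of an end-of-fault consumer (simplified, not the exact kernel code) */
static void fault_out_of_memory_sketch(void)
{
	/* memcg OOM: kill inside the cgroup or wait for userspace handling */
	if (mem_cgroup_oom_synchronize(true))
		return;

	/* no memcg OOM was recorded for current: global OOM handling goes here */
}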
2132 2132
2133 /** 2133 /**
2134 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 2134 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
2135 * @page: page that is going to change accounted state 2135 * @page: page that is going to change accounted state
2136 * @locked: &memcg->move_lock slowpath was taken 2136 * @locked: &memcg->move_lock slowpath was taken
2137 * @flags: IRQ-state flags for &memcg->move_lock 2137 * @flags: IRQ-state flags for &memcg->move_lock
2138 * 2138 *
2139 * This function must mark the beginning of an accounted page state 2139 * This function must mark the beginning of an accounted page state
2140 * change to prevent double accounting when the page is concurrently 2140 * change to prevent double accounting when the page is concurrently
2141 * being moved to another memcg: 2141 * being moved to another memcg:
2142 * 2142 *
2143 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 2143 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
2144 * if (TestClearPageState(page)) 2144 * if (TestClearPageState(page))
2145 * mem_cgroup_update_page_stat(memcg, state, -1); 2145 * mem_cgroup_update_page_stat(memcg, state, -1);
2146 * mem_cgroup_end_page_stat(memcg, locked, flags); 2146 * mem_cgroup_end_page_stat(memcg, locked, flags);
2147 * 2147 *
2148 * The RCU lock is held throughout the transaction. The fast path can 2148 * The RCU lock is held throughout the transaction. The fast path can
2149 * get away without acquiring the memcg->move_lock (@locked is false) 2149 * get away without acquiring the memcg->move_lock (@locked is false)
2150 * because page moving starts with an RCU grace period. 2150 * because page moving starts with an RCU grace period.
2151 * 2151 *
2152 * The RCU lock also protects the memcg from being freed when the page 2152 * The RCU lock also protects the memcg from being freed when the page
2153 * state that is going to change is the only thing preventing the page 2153 * state that is going to change is the only thing preventing the page
2154 * from being uncharged. E.g. end-writeback clearing PageWriteback(), 2154 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2155 * which allows migration to go ahead and uncharge the page before the 2155 * which allows migration to go ahead and uncharge the page before the
2156 * account transaction might be complete. 2156 * account transaction might be complete.
2157 */ 2157 */
2158 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 2158 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2159 bool *locked, 2159 bool *locked,
2160 unsigned long *flags) 2160 unsigned long *flags)
2161 { 2161 {
2162 struct mem_cgroup *memcg; 2162 struct mem_cgroup *memcg;
2163 struct page_cgroup *pc; 2163 struct page_cgroup *pc;
2164 2164
2165 rcu_read_lock(); 2165 rcu_read_lock();
2166 2166
2167 if (mem_cgroup_disabled()) 2167 if (mem_cgroup_disabled())
2168 return NULL; 2168 return NULL;
2169 2169
2170 pc = lookup_page_cgroup(page); 2170 pc = lookup_page_cgroup(page);
2171 again: 2171 again:
2172 memcg = pc->mem_cgroup; 2172 memcg = pc->mem_cgroup;
2173 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2173 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2174 return NULL; 2174 return NULL;
2175 2175
2176 *locked = false; 2176 *locked = false;
2177 if (atomic_read(&memcg->moving_account) <= 0) 2177 if (atomic_read(&memcg->moving_account) <= 0)
2178 return memcg; 2178 return memcg;
2179 2179
2180 move_lock_mem_cgroup(memcg, flags); 2180 move_lock_mem_cgroup(memcg, flags);
2181 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2181 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2182 move_unlock_mem_cgroup(memcg, flags); 2182 move_unlock_mem_cgroup(memcg, flags);
2183 goto again; 2183 goto again;
2184 } 2184 }
2185 *locked = true; 2185 *locked = true;
2186 2186
2187 return memcg; 2187 return memcg;
2188 } 2188 }
2189 2189
2190 /** 2190 /**
2191 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2191 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2192 * @memcg: the memcg that was accounted against 2192 * @memcg: the memcg that was accounted against
2193 * @locked: value received from mem_cgroup_begin_page_stat() 2193 * @locked: value received from mem_cgroup_begin_page_stat()
2194 * @flags: value received from mem_cgroup_begin_page_stat() 2194 * @flags: value received from mem_cgroup_begin_page_stat()
2195 */ 2195 */
2196 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, 2196 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
2197 unsigned long flags) 2197 unsigned long flags)
2198 { 2198 {
2199 if (memcg && locked) 2199 if (memcg && locked)
2200 move_unlock_mem_cgroup(memcg, &flags); 2200 move_unlock_mem_cgroup(memcg, &flags);
2201 2201
2202 rcu_read_unlock(); 2202 rcu_read_unlock();
2203 } 2203 }
2204 2204
2205 /** 2205 /**
2206 * mem_cgroup_update_page_stat - update page state statistics 2206 * mem_cgroup_update_page_stat - update page state statistics
2207 * @memcg: memcg to account against 2207 * @memcg: memcg to account against
2208 * @idx: page state item to account 2208 * @idx: page state item to account
2209 * @val: number of pages (positive or negative) 2209 * @val: number of pages (positive or negative)
2210 * 2210 *
2211 * See mem_cgroup_begin_page_stat() for locking requirements. 2211 * See mem_cgroup_begin_page_stat() for locking requirements.
2212 */ 2212 */
2213 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, 2213 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2214 enum mem_cgroup_stat_index idx, int val) 2214 enum mem_cgroup_stat_index idx, int val)
2215 { 2215 {
2216 VM_BUG_ON(!rcu_read_lock_held()); 2216 VM_BUG_ON(!rcu_read_lock_held());
2217 2217
2218 if (memcg) 2218 if (memcg)
2219 this_cpu_add(memcg->stat->count[idx], val); 2219 this_cpu_add(memcg->stat->count[idx], val);
2220 } 2220 }
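
Putting the three helpers together, a caller follows exactly the transaction shown in the mem_cgroup_begin_page_stat() kerneldoc. The sketch below keeps the doc's hypothetical TestClearPageState()/state names; they are placeholders for a real page-flag test and a real mem_cgroup_stat_index value, not kernel symbols:

/* hypothetical caller, mirroring the kerneldoc example above */
static void example_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;
	bool locked;

	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
	if (TestClearPageState(page))	/* placeholder page-flag test */
		mem_cgroup_update_page_stat(memcg, state, -1);	/* placeholder index */
	mem_cgroup_end_page_stat(memcg, locked, flags);
}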
2221 2221
2222 /* 2222 /*
2223 * size of first charge trial. "32" comes from vmscan.c's magic value. 2223 * size of first charge trial. "32" comes from vmscan.c's magic value.
2224 * TODO: maybe necessary to use big numbers in big irons. 2224 * TODO: maybe necessary to use big numbers in big irons.
2225 */ 2225 */
2226 #define CHARGE_BATCH 32U 2226 #define CHARGE_BATCH 32U
2227 struct memcg_stock_pcp { 2227 struct memcg_stock_pcp {
2228 struct mem_cgroup *cached; /* this is never the root cgroup */ 2228 struct mem_cgroup *cached; /* this is never the root cgroup */
2229 unsigned int nr_pages; 2229 unsigned int nr_pages;
2230 struct work_struct work; 2230 struct work_struct work;
2231 unsigned long flags; 2231 unsigned long flags;
2232 #define FLUSHING_CACHED_CHARGE 0 2232 #define FLUSHING_CACHED_CHARGE 0
2233 }; 2233 };
2234 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2234 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2235 static DEFINE_MUTEX(percpu_charge_mutex); 2235 static DEFINE_MUTEX(percpu_charge_mutex);
2236 2236
2237 /** 2237 /**
2238 * consume_stock: Try to consume stocked charge on this cpu. 2238 * consume_stock: Try to consume stocked charge on this cpu.
2239 * @memcg: memcg to consume from. 2239 * @memcg: memcg to consume from.
2240 * @nr_pages: how many pages to charge. 2240 * @nr_pages: how many pages to charge.
2241 * 2241 *
2242 * The charges will only happen if @memcg matches the current cpu's memcg 2242 * The charges will only happen if @memcg matches the current cpu's memcg
2243 * stock, and at least @nr_pages are available in that stock. Failure to 2243 * stock, and at least @nr_pages are available in that stock. Failure to
2244 * service an allocation will refill the stock. 2244 * service an allocation will refill the stock.
2245 * 2245 *
2246 * returns true if successful, false otherwise. 2246 * returns true if successful, false otherwise.
2247 */ 2247 */
2248 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2248 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2249 { 2249 {
2250 struct memcg_stock_pcp *stock; 2250 struct memcg_stock_pcp *stock;
2251 bool ret = false; 2251 bool ret = false;
2252 2252
2253 if (nr_pages > CHARGE_BATCH) 2253 if (nr_pages > CHARGE_BATCH)
2254 return ret; 2254 return ret;
2255 2255
2256 stock = &get_cpu_var(memcg_stock); 2256 stock = &get_cpu_var(memcg_stock);
2257 if (memcg == stock->cached && stock->nr_pages >= nr_pages) { 2257 if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2258 stock->nr_pages -= nr_pages; 2258 stock->nr_pages -= nr_pages;
2259 ret = true; 2259 ret = true;
2260 } 2260 }
2261 put_cpu_var(memcg_stock); 2261 put_cpu_var(memcg_stock);
2262 return ret; 2262 return ret;
2263 } 2263 }
2264 2264
2265 /* 2265 /*
2266 * Returns stocks cached in percpu and resets cached information. 2266 * Returns stocks cached in percpu and resets cached information.
2267 */ 2267 */
2268 static void drain_stock(struct memcg_stock_pcp *stock) 2268 static void drain_stock(struct memcg_stock_pcp *stock)
2269 { 2269 {
2270 struct mem_cgroup *old = stock->cached; 2270 struct mem_cgroup *old = stock->cached;
2271 2271
2272 if (stock->nr_pages) { 2272 if (stock->nr_pages) {
2273 page_counter_uncharge(&old->memory, stock->nr_pages); 2273 page_counter_uncharge(&old->memory, stock->nr_pages);
2274 if (do_swap_account) 2274 if (do_swap_account)
2275 page_counter_uncharge(&old->memsw, stock->nr_pages); 2275 page_counter_uncharge(&old->memsw, stock->nr_pages);
2276 css_put_many(&old->css, stock->nr_pages);
2276 stock->nr_pages = 0; 2277 stock->nr_pages = 0;
2277 } 2278 }
2278 stock->cached = NULL; 2279 stock->cached = NULL;
2279 } 2280 }
2280 2281
2281 /* 2282 /*
2282 * This must be called under preempt disabled or must be called by 2283 * This must be called under preempt disabled or must be called by
2283 * a thread which is pinned to local cpu. 2284 * a thread which is pinned to local cpu.
2284 */ 2285 */
2285 static void drain_local_stock(struct work_struct *dummy) 2286 static void drain_local_stock(struct work_struct *dummy)
2286 { 2287 {
2287 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); 2288 struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);
2288 drain_stock(stock); 2289 drain_stock(stock);
2289 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2290 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2290 } 2291 }
2291 2292
2292 static void __init memcg_stock_init(void) 2293 static void __init memcg_stock_init(void)
2293 { 2294 {
2294 int cpu; 2295 int cpu;
2295 2296
2296 for_each_possible_cpu(cpu) { 2297 for_each_possible_cpu(cpu) {
2297 struct memcg_stock_pcp *stock = 2298 struct memcg_stock_pcp *stock =
2298 &per_cpu(memcg_stock, cpu); 2299 &per_cpu(memcg_stock, cpu);
2299 INIT_WORK(&stock->work, drain_local_stock); 2300 INIT_WORK(&stock->work, drain_local_stock);
2300 } 2301 }
2301 } 2302 }
2302 2303
2303 /* 2304 /*
2304 * Cache charges(nr_pages) to local per_cpu area. 2305 * Cache charges(nr_pages) to local per_cpu area.
2305 * This will be consumed by consume_stock() function, later. 2306 * This will be consumed by consume_stock() function, later.
2306 */ 2307 */
2307 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2308 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2308 { 2309 {
2309 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2310 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2310 2311
2311 if (stock->cached != memcg) { /* reset if necessary */ 2312 if (stock->cached != memcg) { /* reset if necessary */
2312 drain_stock(stock); 2313 drain_stock(stock);
2313 stock->cached = memcg; 2314 stock->cached = memcg;
2314 } 2315 }
2315 stock->nr_pages += nr_pages; 2316 stock->nr_pages += nr_pages;
2316 put_cpu_var(memcg_stock); 2317 put_cpu_var(memcg_stock);
2317 } 2318 }
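
consume_stock()/refill_stock() implement a small per-CPU cache of pre-charged pages so that most single-page charges never touch the shared page counters. A userspace model of that caching idea (a plain struct instead of per-CPU data, no locking, and the 32-page batch kept only for illustration):

#include <stdio.h>

/*
 * Userspace model of the per-cpu charge stock: keep up to BATCH
 * pre-charged pages per "CPU" so small charges can be served locally.
 */
#define BATCH 32U

struct stock {
	const void *cached;	/* stand-in for the cached memcg pointer */
	unsigned int nr_pages;
};

static int consume(struct stock *s, const void *memcg, unsigned int nr)
{
	if (nr <= BATCH && s->cached == memcg && s->nr_pages >= nr) {
		s->nr_pages -= nr;
		return 1;	/* charged from the local stock */
	}
	return 0;		/* caller falls back to the shared counters */
}

static void refill(struct stock *s, const void *memcg, unsigned int nr)
{
	if (s->cached != memcg) {	/* reset if necessary */
		s->nr_pages = 0;	/* the real code uncharges these first */
		s->cached = memcg;
	}
	s->nr_pages += nr;
}

int main(void)
{
	struct stock s = { 0 };
	int memcg_a;	/* any address works as a stand-in memcg */
	int hit;

	refill(&s, &memcg_a, BATCH);
	hit = consume(&s, &memcg_a, 1);
	printf("hit=%d left=%u\n", hit, s.nr_pages);	/* hit=1 left=31 */
	return 0;
}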
2318 2319
2319 /* 2320 /*
2320 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2321 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2321 * of the hierarchy under it. sync flag says whether we should block 2322 * of the hierarchy under it. sync flag says whether we should block
2322 * until the work is done. 2323 * until the work is done.
2323 */ 2324 */
2324 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2325 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2325 { 2326 {
2326 int cpu, curcpu; 2327 int cpu, curcpu;
2327 2328
2328 /* Notify other cpus that system-wide "drain" is running */ 2329 /* Notify other cpus that system-wide "drain" is running */
2329 get_online_cpus(); 2330 get_online_cpus();
2330 curcpu = get_cpu(); 2331 curcpu = get_cpu();
2331 for_each_online_cpu(cpu) { 2332 for_each_online_cpu(cpu) {
2332 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2333 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2333 struct mem_cgroup *memcg; 2334 struct mem_cgroup *memcg;
2334 2335
2335 memcg = stock->cached; 2336 memcg = stock->cached;
2336 if (!memcg || !stock->nr_pages) 2337 if (!memcg || !stock->nr_pages)
2337 continue; 2338 continue;
2338 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2339 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2339 continue; 2340 continue;
2340 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2341 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2341 if (cpu == curcpu) 2342 if (cpu == curcpu)
2342 drain_local_stock(&stock->work); 2343 drain_local_stock(&stock->work);
2343 else 2344 else
2344 schedule_work_on(cpu, &stock->work); 2345 schedule_work_on(cpu, &stock->work);
2345 } 2346 }
2346 } 2347 }
2347 put_cpu(); 2348 put_cpu();
2348 2349
2349 if (!sync) 2350 if (!sync)
2350 goto out; 2351 goto out;
2351 2352
2352 for_each_online_cpu(cpu) { 2353 for_each_online_cpu(cpu) {
2353 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2354 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2354 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2355 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2355 flush_work(&stock->work); 2356 flush_work(&stock->work);
2356 } 2357 }
2357 out: 2358 out:
2358 put_online_cpus(); 2359 put_online_cpus();
2359 } 2360 }
2360 2361
2361 /* 2362 /*
2362 * Tries to drain stocked charges in other cpus. This function is asynchronous 2363 * Tries to drain stocked charges in other cpus. This function is asynchronous
2363 * and just puts a work per cpu for draining locally on each cpu. Caller can 2364 * and just puts a work per cpu for draining locally on each cpu. Caller can
2364 * expect some charges will be back later but cannot wait for it. 2365 * expect some charges will be back later but cannot wait for it.
2365 */ 2366 */
2366 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2367 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2367 { 2368 {
2368 /* 2369 /*
2369 * If someone calls draining, avoid adding more kworker runs. 2370 * If someone calls draining, avoid adding more kworker runs.
2370 */ 2371 */
2371 if (!mutex_trylock(&percpu_charge_mutex)) 2372 if (!mutex_trylock(&percpu_charge_mutex))
2372 return; 2373 return;
2373 drain_all_stock(root_memcg, false); 2374 drain_all_stock(root_memcg, false);
2374 mutex_unlock(&percpu_charge_mutex); 2375 mutex_unlock(&percpu_charge_mutex);
2375 } 2376 }
2376 2377
2377 /* This is a synchronous drain interface. */ 2378 /* This is a synchronous drain interface. */
2378 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2379 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2379 { 2380 {
2380 /* called when force_empty is called */ 2381 /* called when force_empty is called */
2381 mutex_lock(&percpu_charge_mutex); 2382 mutex_lock(&percpu_charge_mutex);
2382 drain_all_stock(root_memcg, true); 2383 drain_all_stock(root_memcg, true);
2383 mutex_unlock(&percpu_charge_mutex); 2384 mutex_unlock(&percpu_charge_mutex);
2384 } 2385 }
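The difference between the two entry points is just mutex_trylock() versus mutex_lock() on percpu_charge_mutex: the asynchronous caller gives up if a drain is already in flight, while the synchronous caller waits for its turn and then blocks until the per-cpu work has run. A minimal user-space analogue of that pattern (pthreads; the names are hypothetical and this is only a sketch, not kernel code):

        #include <pthread.h>
        #include <stdbool.h>
        #include <stdio.h>

        static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

        static void drain_all(bool sync)
        {
                /* stand-in for scheduling (and, if sync, flushing) per-cpu work */
                printf("draining, sync=%d\n", sync);
        }

        static void drain_async(void)
        {
                if (pthread_mutex_trylock(&drain_mutex))
                        return;                 /* someone is already draining */
                drain_all(false);
                pthread_mutex_unlock(&drain_mutex);
        }

        static void drain_sync(void)
        {
                pthread_mutex_lock(&drain_mutex);       /* always wait our turn */
                drain_all(true);
                pthread_mutex_unlock(&drain_mutex);
        }

        int main(void)
        {
                drain_async();
                drain_sync();
                return 0;
        }

The trylock on the asynchronous side is what keeps reclaim from piling up redundant kworker runs.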
2385 2386
2386 /* 2387 /*
2387 * This function drains the percpu counter values from a DEAD cpu and 2388 * This function drains the percpu counter values from a DEAD cpu and
2388 * moves them to the local cpu. Note that this function can be preempted. 2389 * moves them to the local cpu. Note that this function can be preempted.
2389 */ 2390 */
2390 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2391 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2391 { 2392 {
2392 int i; 2393 int i;
2393 2394
2394 spin_lock(&memcg->pcp_counter_lock); 2395 spin_lock(&memcg->pcp_counter_lock);
2395 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2396 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2396 long x = per_cpu(memcg->stat->count[i], cpu); 2397 long x = per_cpu(memcg->stat->count[i], cpu);
2397 2398
2398 per_cpu(memcg->stat->count[i], cpu) = 0; 2399 per_cpu(memcg->stat->count[i], cpu) = 0;
2399 memcg->nocpu_base.count[i] += x; 2400 memcg->nocpu_base.count[i] += x;
2400 } 2401 }
2401 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2402 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2402 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2403 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2403 2404
2404 per_cpu(memcg->stat->events[i], cpu) = 0; 2405 per_cpu(memcg->stat->events[i], cpu) = 0;
2405 memcg->nocpu_base.events[i] += x; 2406 memcg->nocpu_base.events[i] += x;
2406 } 2407 }
2407 spin_unlock(&memcg->pcp_counter_lock); 2408 spin_unlock(&memcg->pcp_counter_lock);
2408 } 2409 }
2409 2410
2410 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2411 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2411 unsigned long action, 2412 unsigned long action,
2412 void *hcpu) 2413 void *hcpu)
2413 { 2414 {
2414 int cpu = (unsigned long)hcpu; 2415 int cpu = (unsigned long)hcpu;
2415 struct memcg_stock_pcp *stock; 2416 struct memcg_stock_pcp *stock;
2416 struct mem_cgroup *iter; 2417 struct mem_cgroup *iter;
2417 2418
2418 if (action == CPU_ONLINE) 2419 if (action == CPU_ONLINE)
2419 return NOTIFY_OK; 2420 return NOTIFY_OK;
2420 2421
2421 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2422 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2422 return NOTIFY_OK; 2423 return NOTIFY_OK;
2423 2424
2424 for_each_mem_cgroup(iter) 2425 for_each_mem_cgroup(iter)
2425 mem_cgroup_drain_pcp_counter(iter, cpu); 2426 mem_cgroup_drain_pcp_counter(iter, cpu);
2426 2427
2427 stock = &per_cpu(memcg_stock, cpu); 2428 stock = &per_cpu(memcg_stock, cpu);
2428 drain_stock(stock); 2429 drain_stock(stock);
2429 return NOTIFY_OK; 2430 return NOTIFY_OK;
2430 } 2431 }
2431 2432
2432 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2433 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2433 unsigned int nr_pages) 2434 unsigned int nr_pages)
2434 { 2435 {
2435 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2436 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2436 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 2437 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2437 struct mem_cgroup *mem_over_limit; 2438 struct mem_cgroup *mem_over_limit;
2438 struct page_counter *counter; 2439 struct page_counter *counter;
2439 unsigned long nr_reclaimed; 2440 unsigned long nr_reclaimed;
2440 bool may_swap = true; 2441 bool may_swap = true;
2441 bool drained = false; 2442 bool drained = false;
2442 int ret = 0; 2443 int ret = 0;
2443 2444
2444 if (mem_cgroup_is_root(memcg)) 2445 if (mem_cgroup_is_root(memcg))
2445 goto done; 2446 goto done;
2446 retry: 2447 retry:
2447 if (consume_stock(memcg, nr_pages)) 2448 if (consume_stock(memcg, nr_pages))
2448 goto done; 2449 goto done;
2449 2450
2450 if (!do_swap_account || 2451 if (!do_swap_account ||
2451 !page_counter_try_charge(&memcg->memsw, batch, &counter)) { 2452 !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2452 if (!page_counter_try_charge(&memcg->memory, batch, &counter)) 2453 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2453 goto done_restock; 2454 goto done_restock;
2454 if (do_swap_account) 2455 if (do_swap_account)
2455 page_counter_uncharge(&memcg->memsw, batch); 2456 page_counter_uncharge(&memcg->memsw, batch);
2456 mem_over_limit = mem_cgroup_from_counter(counter, memory); 2457 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2457 } else { 2458 } else {
2458 mem_over_limit = mem_cgroup_from_counter(counter, memsw); 2459 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2459 may_swap = false; 2460 may_swap = false;
2460 } 2461 }
2461 2462
2462 if (batch > nr_pages) { 2463 if (batch > nr_pages) {
2463 batch = nr_pages; 2464 batch = nr_pages;
2464 goto retry; 2465 goto retry;
2465 } 2466 }
2466 2467
2467 /* 2468 /*
2468 * Unlike in global OOM situations, memcg is not in a physical 2469 * Unlike in global OOM situations, memcg is not in a physical
2469 * memory shortage. Allow dying and OOM-killed tasks to 2470 * memory shortage. Allow dying and OOM-killed tasks to
2470 * bypass the last charges so that they can exit quickly and 2471 * bypass the last charges so that they can exit quickly and
2471 * free their memory. 2472 * free their memory.
2472 */ 2473 */
2473 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2474 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2474 fatal_signal_pending(current) || 2475 fatal_signal_pending(current) ||
2475 current->flags & PF_EXITING)) 2476 current->flags & PF_EXITING))
2476 goto bypass; 2477 goto bypass;
2477 2478
2478 if (unlikely(task_in_memcg_oom(current))) 2479 if (unlikely(task_in_memcg_oom(current)))
2479 goto nomem; 2480 goto nomem;
2480 2481
2481 if (!(gfp_mask & __GFP_WAIT)) 2482 if (!(gfp_mask & __GFP_WAIT))
2482 goto nomem; 2483 goto nomem;
2483 2484
2484 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2485 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2485 gfp_mask, may_swap); 2486 gfp_mask, may_swap);
2486 2487
2487 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2488 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2488 goto retry; 2489 goto retry;
2489 2490
2490 if (!drained) { 2491 if (!drained) {
2491 drain_all_stock_async(mem_over_limit); 2492 drain_all_stock_async(mem_over_limit);
2492 drained = true; 2493 drained = true;
2493 goto retry; 2494 goto retry;
2494 } 2495 }
2495 2496
2496 if (gfp_mask & __GFP_NORETRY) 2497 if (gfp_mask & __GFP_NORETRY)
2497 goto nomem; 2498 goto nomem;
2498 /* 2499 /*
2499 * Even though the limit is exceeded at this point, reclaim 2500 * Even though the limit is exceeded at this point, reclaim
2500 * may have been able to free some pages. Retry the charge 2501 * may have been able to free some pages. Retry the charge
2501 * before killing the task. 2502 * before killing the task.
2502 * 2503 *
2503 * Only for regular pages, though: huge pages are rather 2504 * Only for regular pages, though: huge pages are rather
2504 * unlikely to succeed so close to the limit, and we fall back 2505 * unlikely to succeed so close to the limit, and we fall back
2505 * to regular pages anyway in case of failure. 2506 * to regular pages anyway in case of failure.
2506 */ 2507 */
2507 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) 2508 if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2508 goto retry; 2509 goto retry;
2509 /* 2510 /*
2510 * During task move, charges can be doubly counted. So it's 2511 * During task move, charges can be doubly counted. So it's
2511 * better to wait until the end of task_move if a move is in progress. 2512 * better to wait until the end of task_move if a move is in progress.
2512 */ 2513 */
2513 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2514 if (mem_cgroup_wait_acct_move(mem_over_limit))
2514 goto retry; 2515 goto retry;
2515 2516
2516 if (nr_retries--) 2517 if (nr_retries--)
2517 goto retry; 2518 goto retry;
2518 2519
2519 if (gfp_mask & __GFP_NOFAIL) 2520 if (gfp_mask & __GFP_NOFAIL)
2520 goto bypass; 2521 goto bypass;
2521 2522
2522 if (fatal_signal_pending(current)) 2523 if (fatal_signal_pending(current))
2523 goto bypass; 2524 goto bypass;
2524 2525
2525 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2526 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2526 nomem: 2527 nomem:
2527 if (!(gfp_mask & __GFP_NOFAIL)) 2528 if (!(gfp_mask & __GFP_NOFAIL))
2528 return -ENOMEM; 2529 return -ENOMEM;
2529 bypass: 2530 bypass:
2530 return -EINTR; 2531 return -EINTR;
2531 2532
2532 done_restock: 2533 done_restock:
2534 css_get_many(&memcg->css, batch);
2533 if (batch > nr_pages) 2535 if (batch > nr_pages)
2534 refill_stock(memcg, batch - nr_pages); 2536 refill_stock(memcg, batch - nr_pages);
2535 done: 2537 done:
2536 return ret; 2538 return ret;
2537 } 2539 }
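As a rough model of the batching in try_charge() above: the caller asks for nr_pages, a whole CHARGE_BATCH worth is charged against the shared counters at once, and the surplus is parked in the per-cpu stock so that the next small charges are served locally (consume_stock()) instead of touching the counters again. A user-space sketch under simplifying assumptions (one thread per "cpu", no limits or reclaim; BATCH and the variable names are hypothetical):

        #include <stdio.h>

        #define BATCH 32U                       /* stands in for CHARGE_BATCH */

        static unsigned long shared_counter;     /* expensive to update */
        static _Thread_local unsigned int stock; /* cheap, per-"cpu" surplus */

        static void charge(unsigned int nr_pages)
        {
                if (stock >= nr_pages) {        /* consume_stock() fast path */
                        stock -= nr_pages;
                        return;
                }
                shared_counter += BATCH > nr_pages ? BATCH : nr_pages;
                if (BATCH > nr_pages)
                        stock += BATCH - nr_pages;      /* refill_stock() */
        }

        int main(void)
        {
                charge(1);      /* hits the shared counter, stocks the surplus */
                charge(1);      /* served entirely from the local stock */
                printf("counter=%lu stock=%u\n", shared_counter, stock);
                return 0;
        }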
2538 2540
2539 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) 2541 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2540 { 2542 {
2541 if (mem_cgroup_is_root(memcg)) 2543 if (mem_cgroup_is_root(memcg))
2542 return; 2544 return;
2543 2545
2544 page_counter_uncharge(&memcg->memory, nr_pages); 2546 page_counter_uncharge(&memcg->memory, nr_pages);
2545 if (do_swap_account) 2547 if (do_swap_account)
2546 page_counter_uncharge(&memcg->memsw, nr_pages); 2548 page_counter_uncharge(&memcg->memsw, nr_pages);
2549
2550 css_put_many(&memcg->css, nr_pages);
2547 } 2551 }
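The new css_get_many()/css_put_many() calls are the point of this patch: try_charge() now takes one css reference per page in the charged batch (surplus references travel with the stock), and cancel_charge(), like the uncharge paths, drops one reference per page it hands back. A user-space sketch of that pairing (hypothetical names, a plain C11 atomic counter rather than the kernel's percpu_ref):

        #include <stdatomic.h>
        #include <assert.h>

        struct group {
                atomic_long refs;               /* stand-in for the css refcount */
        };

        static void get_many(struct group *g, unsigned int nr)
        {
                atomic_fetch_add(&g->refs, nr);
        }

        static void put_many(struct group *g, unsigned int nr)
        {
                long old = atomic_fetch_sub(&g->refs, nr);
                assert(old >= (long)nr);        /* never drop more than we hold */
        }

        static void charge(struct group *g, unsigned int nr_pages)
        {
                /* ... page counters charged here ... */
                get_many(g, nr_pages);          /* one reference per charged page */
        }

        static void cancel_charge(struct group *g, unsigned int nr_pages)
        {
                /* ... page counters rolled back here ... */
                put_many(g, nr_pages);          /* drop the per-page references */
        }

        int main(void)
        {
                struct group g = { .refs = 1 }; /* base reference */

                charge(&g, 32);
                cancel_charge(&g, 32);
                return atomic_load(&g.refs) == 1 ? 0 : 1;
        }

The charge paths and the uncharge/cancel paths stay balanced as long as every get_many() is matched by a put_many() for the same number of pages.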
2548 2552
2549 /* 2553 /*
2550 * A helper function to get a mem_cgroup from an ID. Must be called under 2554 * A helper function to get a mem_cgroup from an ID. Must be called under
2551 * rcu_read_lock(). The caller is responsible for calling 2555 * rcu_read_lock(). The caller is responsible for calling
2552 * css_tryget_online() if the mem_cgroup is used for charging. (dropping a 2556 * css_tryget_online() if the mem_cgroup is used for charging. (dropping a
2553 * refcnt from swap can happen against a removed memcg.) 2557 * refcnt from swap can happen against a removed memcg.)
2554 */ 2558 */
2555 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2559 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2556 { 2560 {
2557 /* ID 0 is unused ID */ 2561 /* ID 0 is unused ID */
2558 if (!id) 2562 if (!id)
2559 return NULL; 2563 return NULL;
2560 return mem_cgroup_from_id(id); 2564 return mem_cgroup_from_id(id);
2561 } 2565 }
2562 2566
2563 /* 2567 /*
2564 * try_get_mem_cgroup_from_page - look up page's memcg association 2568 * try_get_mem_cgroup_from_page - look up page's memcg association
2565 * @page: the page 2569 * @page: the page
2566 * 2570 *
2567 * Look up, get a css reference, and return the memcg that owns @page. 2571 * Look up, get a css reference, and return the memcg that owns @page.
2568 * 2572 *
2569 * The page must be locked to prevent racing with swap-in and page 2573 * The page must be locked to prevent racing with swap-in and page
2570 * cache charges. If coming from an unlocked page table, the caller 2574 * cache charges. If coming from an unlocked page table, the caller
2571 * must ensure the page is on the LRU or this can race with charging. 2575 * must ensure the page is on the LRU or this can race with charging.
2572 */ 2576 */
2573 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2577 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2574 { 2578 {
2575 struct mem_cgroup *memcg = NULL; 2579 struct mem_cgroup *memcg = NULL;
2576 struct page_cgroup *pc; 2580 struct page_cgroup *pc;
2577 unsigned short id; 2581 unsigned short id;
2578 swp_entry_t ent; 2582 swp_entry_t ent;
2579 2583
2580 VM_BUG_ON_PAGE(!PageLocked(page), page); 2584 VM_BUG_ON_PAGE(!PageLocked(page), page);
2581 2585
2582 pc = lookup_page_cgroup(page); 2586 pc = lookup_page_cgroup(page);
2583 if (PageCgroupUsed(pc)) { 2587 if (PageCgroupUsed(pc)) {
2584 memcg = pc->mem_cgroup; 2588 memcg = pc->mem_cgroup;
2585 if (memcg && !css_tryget_online(&memcg->css)) 2589 if (memcg && !css_tryget_online(&memcg->css))
2586 memcg = NULL; 2590 memcg = NULL;
2587 } else if (PageSwapCache(page)) { 2591 } else if (PageSwapCache(page)) {
2588 ent.val = page_private(page); 2592 ent.val = page_private(page);
2589 id = lookup_swap_cgroup_id(ent); 2593 id = lookup_swap_cgroup_id(ent);
2590 rcu_read_lock(); 2594 rcu_read_lock();
2591 memcg = mem_cgroup_lookup(id); 2595 memcg = mem_cgroup_lookup(id);
2592 if (memcg && !css_tryget_online(&memcg->css)) 2596 if (memcg && !css_tryget_online(&memcg->css))
2593 memcg = NULL; 2597 memcg = NULL;
2594 rcu_read_unlock(); 2598 rcu_read_unlock();
2595 } 2599 }
2596 return memcg; 2600 return memcg;
2597 } 2601 }
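try_get_mem_cgroup_from_page() only returns a memcg when css_tryget_online() still succeeds, i.e. when a reference can actually be taken before the group goes away. The core of that "look up, then tryget" pattern can be sketched in user space with a plain atomic counter (hypothetical names; the real css refcount is a percpu_ref with an online state, which this does not model):

        #include <stdatomic.h>
        #include <stdbool.h>
        #include <stddef.h>

        struct obj {
                atomic_long refs;
        };

        static bool tryget(struct obj *o)
        {
                long cur = atomic_load(&o->refs);

                while (cur > 0) {
                        /* only succeed while at least one reference is left */
                        if (atomic_compare_exchange_weak(&o->refs, &cur, cur + 1))
                                return true;
                }
                return false;                   /* object already on its way out */
        }

        static struct obj *lookup_and_get(struct obj *candidate)
        {
                if (candidate && !tryget(candidate))
                        candidate = NULL;       /* raced with the final put */
                return candidate;
        }

        int main(void)
        {
                struct obj live = { .refs = 1 }, dead = { .refs = 0 };

                return (lookup_and_get(&live) == &live &&
                        lookup_and_get(&dead) == NULL) ? 0 : 1;
        }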
2598 2602
2599 static void lock_page_lru(struct page *page, int *isolated) 2603 static void lock_page_lru(struct page *page, int *isolated)
2600 { 2604 {
2601 struct zone *zone = page_zone(page); 2605 struct zone *zone = page_zone(page);
2602 2606
2603 spin_lock_irq(&zone->lru_lock); 2607 spin_lock_irq(&zone->lru_lock);
2604 if (PageLRU(page)) { 2608 if (PageLRU(page)) {
2605 struct lruvec *lruvec; 2609 struct lruvec *lruvec;
2606 2610
2607 lruvec = mem_cgroup_page_lruvec(page, zone); 2611 lruvec = mem_cgroup_page_lruvec(page, zone);
2608 ClearPageLRU(page); 2612 ClearPageLRU(page);
2609 del_page_from_lru_list(page, lruvec, page_lru(page)); 2613 del_page_from_lru_list(page, lruvec, page_lru(page));
2610 *isolated = 1; 2614 *isolated = 1;
2611 } else 2615 } else
2612 *isolated = 0; 2616 *isolated = 0;
2613 } 2617 }
2614 2618
2615 static void unlock_page_lru(struct page *page, int isolated) 2619 static void unlock_page_lru(struct page *page, int isolated)
2616 { 2620 {
2617 struct zone *zone = page_zone(page); 2621 struct zone *zone = page_zone(page);
2618 2622
2619 if (isolated) { 2623 if (isolated) {
2620 struct lruvec *lruvec; 2624 struct lruvec *lruvec;
2621 2625
2622 lruvec = mem_cgroup_page_lruvec(page, zone); 2626 lruvec = mem_cgroup_page_lruvec(page, zone);
2623 VM_BUG_ON_PAGE(PageLRU(page), page); 2627 VM_BUG_ON_PAGE(PageLRU(page), page);
2624 SetPageLRU(page); 2628 SetPageLRU(page);
2625 add_page_to_lru_list(page, lruvec, page_lru(page)); 2629 add_page_to_lru_list(page, lruvec, page_lru(page));
2626 } 2630 }
2627 spin_unlock_irq(&zone->lru_lock); 2631 spin_unlock_irq(&zone->lru_lock);
2628 } 2632 }
2629 2633
2630 static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2634 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2631 bool lrucare) 2635 bool lrucare)
2632 { 2636 {
2633 struct page_cgroup *pc = lookup_page_cgroup(page); 2637 struct page_cgroup *pc = lookup_page_cgroup(page);
2634 int isolated; 2638 int isolated;
2635 2639
2636 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2640 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2637 /* 2641 /*
2638 * we don't need page_cgroup_lock for tail pages, because they are not 2642 * we don't need page_cgroup_lock for tail pages, because they are not
2639 * accessed by any other context at this point. 2643 * accessed by any other context at this point.
2640 */ 2644 */
2641 2645
2642 /* 2646 /*
2643 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page 2647 * In some cases (SwapCache and FUSE's splice_buf->radixtree), the page
2644 * may already be on some other mem_cgroup's LRU. Take care of it. 2648 * may already be on some other mem_cgroup's LRU. Take care of it.
2645 */ 2649 */
2646 if (lrucare) 2650 if (lrucare)
2647 lock_page_lru(page, &isolated); 2651 lock_page_lru(page, &isolated);
2648 2652
2649 /* 2653 /*
2650 * Nobody should be changing or seriously looking at 2654 * Nobody should be changing or seriously looking at
2651 * pc->mem_cgroup and pc->flags at this point: 2655 * pc->mem_cgroup and pc->flags at this point:
2652 * 2656 *
2653 * - the page is uncharged 2657 * - the page is uncharged
2654 * 2658 *
2655 * - the page is off-LRU 2659 * - the page is off-LRU
2656 * 2660 *
2657 * - an anonymous fault has exclusive page access, except for 2661 * - an anonymous fault has exclusive page access, except for
2658 * a locked page table 2662 * a locked page table
2659 * 2663 *
2660 * - a page cache insertion, a swapin fault, or a migration 2664 * - a page cache insertion, a swapin fault, or a migration
2661 * have the page locked 2665 * have the page locked
2662 */ 2666 */
2663 pc->mem_cgroup = memcg; 2667 pc->mem_cgroup = memcg;
2664 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); 2668 pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
2665 2669
2666 if (lrucare) 2670 if (lrucare)
2667 unlock_page_lru(page, isolated); 2671 unlock_page_lru(page, isolated);
2668 } 2672 }
2669 2673
2670 #ifdef CONFIG_MEMCG_KMEM 2674 #ifdef CONFIG_MEMCG_KMEM
2671 /* 2675 /*
2672 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2676 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
2673 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 2677 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2674 */ 2678 */
2675 static DEFINE_MUTEX(memcg_slab_mutex); 2679 static DEFINE_MUTEX(memcg_slab_mutex);
2676 2680
2677 static DEFINE_MUTEX(activate_kmem_mutex); 2681 static DEFINE_MUTEX(activate_kmem_mutex);
2678 2682
2679 /* 2683 /*
2680 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2684 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2681 * in the memcg_cache_params struct. 2685 * in the memcg_cache_params struct.
2682 */ 2686 */
2683 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2687 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2684 { 2688 {
2685 struct kmem_cache *cachep; 2689 struct kmem_cache *cachep;
2686 2690
2687 VM_BUG_ON(p->is_root_cache); 2691 VM_BUG_ON(p->is_root_cache);
2688 cachep = p->root_cache; 2692 cachep = p->root_cache;
2689 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2693 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2690 } 2694 }
2691 2695
2692 #ifdef CONFIG_SLABINFO 2696 #ifdef CONFIG_SLABINFO
2693 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2697 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2694 { 2698 {
2695 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2699 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2696 struct memcg_cache_params *params; 2700 struct memcg_cache_params *params;
2697 2701
2698 if (!memcg_kmem_is_active(memcg)) 2702 if (!memcg_kmem_is_active(memcg))
2699 return -EIO; 2703 return -EIO;
2700 2704
2701 print_slabinfo_header(m); 2705 print_slabinfo_header(m);
2702 2706
2703 mutex_lock(&memcg_slab_mutex); 2707 mutex_lock(&memcg_slab_mutex);
2704 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2708 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2705 cache_show(memcg_params_to_cache(params), m); 2709 cache_show(memcg_params_to_cache(params), m);
2706 mutex_unlock(&memcg_slab_mutex); 2710 mutex_unlock(&memcg_slab_mutex);
2707 2711
2708 return 0; 2712 return 0;
2709 } 2713 }
2710 #endif 2714 #endif
2711 2715
2712 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 2716 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2713 unsigned long nr_pages) 2717 unsigned long nr_pages)
2714 { 2718 {
2715 struct page_counter *counter; 2719 struct page_counter *counter;
2716 int ret = 0; 2720 int ret = 0;
2717 2721
2718 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); 2722 ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2719 if (ret < 0) 2723 if (ret < 0)
2720 return ret; 2724 return ret;
2721 2725
2722 ret = try_charge(memcg, gfp, nr_pages); 2726 ret = try_charge(memcg, gfp, nr_pages);
2723 if (ret == -EINTR) { 2727 if (ret == -EINTR) {
2724 /* 2728 /*
2725 * try_charge() chose to bypass to root due to OOM kill or 2729 * try_charge() chose to bypass to root due to OOM kill or
2726 * fatal signal. Since our only options are to either fail 2730 * fatal signal. Since our only options are to either fail
2727 * the allocation or charge it to this cgroup, do it as a 2731 * the allocation or charge it to this cgroup, do it as a
2728 * temporary condition. But we can't fail. From a kmem/slab 2732 * temporary condition. But we can't fail. From a kmem/slab
2729 * perspective, the cache has already been selected, by 2733 * perspective, the cache has already been selected, by
2730 * mem_cgroup_kmem_get_cache(), so it is too late to change 2734 * mem_cgroup_kmem_get_cache(), so it is too late to change
2731 * our minds. 2735 * our minds.
2732 * 2736 *
2733 * This condition will only trigger if the task entered 2737 * This condition will only trigger if the task entered
2734 * memcg_charge_kmem in a sane state, but was OOM-killed 2738 * memcg_charge_kmem in a sane state, but was OOM-killed
2735 * during try_charge() above. Tasks that were already dying 2739 * during try_charge() above. Tasks that were already dying
2736 * when the allocation triggers should have been already 2740 * when the allocation triggers should have been already
2737 * directed to the root cgroup in memcontrol.h 2741 * directed to the root cgroup in memcontrol.h
2738 */ 2742 */
2739 page_counter_charge(&memcg->memory, nr_pages); 2743 page_counter_charge(&memcg->memory, nr_pages);
2740 if (do_swap_account) 2744 if (do_swap_account)
2741 page_counter_charge(&memcg->memsw, nr_pages); 2745 page_counter_charge(&memcg->memsw, nr_pages);
2746 css_get_many(&memcg->css, nr_pages);
2742 ret = 0; 2747 ret = 0;
2743 } else if (ret) 2748 } else if (ret)
2744 page_counter_uncharge(&memcg->kmem, nr_pages); 2749 page_counter_uncharge(&memcg->kmem, nr_pages);
2745 2750
2746 return ret; 2751 return ret;
2747 } 2752 }
2748 2753
2749 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2754 static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
2750 unsigned long nr_pages) 2755 unsigned long nr_pages)
2751 { 2756 {
2752 page_counter_uncharge(&memcg->memory, nr_pages); 2757 page_counter_uncharge(&memcg->memory, nr_pages);
2753 if (do_swap_account) 2758 if (do_swap_account)
2754 page_counter_uncharge(&memcg->memsw, nr_pages); 2759 page_counter_uncharge(&memcg->memsw, nr_pages);
2755 2760
2756 /* Not down to 0 */ 2761 /* Not down to 0 */
2757 if (page_counter_uncharge(&memcg->kmem, nr_pages)) 2762 if (page_counter_uncharge(&memcg->kmem, nr_pages)) {
2763 css_put_many(&memcg->css, nr_pages);
2758 return; 2764 return;
2765 }
2759 2766
2760 /* 2767 /*
2761 * Releases a reference taken in kmem_cgroup_css_offline in case 2768 * Releases a reference taken in kmem_cgroup_css_offline in case
2762 * this last uncharge is racing with the offlining code or it is 2769 * this last uncharge is racing with the offlining code or it is
2763 * outliving the memcg existence. 2770 * outliving the memcg existence.
2764 * 2771 *
2765 * The memory barrier imposed by test&clear is paired with the 2772 * The memory barrier imposed by test&clear is paired with the
2766 * explicit one in memcg_kmem_mark_dead(). 2773 * explicit one in memcg_kmem_mark_dead().
2767 */ 2774 */
2768 if (memcg_kmem_test_and_clear_dead(memcg)) 2775 if (memcg_kmem_test_and_clear_dead(memcg))
2769 css_put(&memcg->css); 2776 css_put(&memcg->css);
2777
2778 css_put_many(&memcg->css, nr_pages);
2770 } 2779 }
2771 2780
2772 /* 2781 /*
2773 * helper for accessing a memcg's index. It will be used as an index in the 2782 * helper for accessing a memcg's index. It will be used as an index in the
2774 * child cache array in kmem_cache, and also to derive its name. This function 2783 * child cache array in kmem_cache, and also to derive its name. This function
2775 * will return -1 when this is not a kmem-limited memcg. 2784 * will return -1 when this is not a kmem-limited memcg.
2776 */ 2785 */
2777 int memcg_cache_id(struct mem_cgroup *memcg) 2786 int memcg_cache_id(struct mem_cgroup *memcg)
2778 { 2787 {
2779 return memcg ? memcg->kmemcg_id : -1; 2788 return memcg ? memcg->kmemcg_id : -1;
2780 } 2789 }
2781 2790
2782 static int memcg_alloc_cache_id(void) 2791 static int memcg_alloc_cache_id(void)
2783 { 2792 {
2784 int id, size; 2793 int id, size;
2785 int err; 2794 int err;
2786 2795
2787 id = ida_simple_get(&kmem_limited_groups, 2796 id = ida_simple_get(&kmem_limited_groups,
2788 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2797 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2789 if (id < 0) 2798 if (id < 0)
2790 return id; 2799 return id;
2791 2800
2792 if (id < memcg_limited_groups_array_size) 2801 if (id < memcg_limited_groups_array_size)
2793 return id; 2802 return id;
2794 2803
2795 /* 2804 /*
2796 * There's no space for the new id in memcg_caches arrays, 2805 * There's no space for the new id in memcg_caches arrays,
2797 * so we have to grow them. 2806 * so we have to grow them.
2798 */ 2807 */
2799 2808
2800 size = 2 * (id + 1); 2809 size = 2 * (id + 1);
2801 if (size < MEMCG_CACHES_MIN_SIZE) 2810 if (size < MEMCG_CACHES_MIN_SIZE)
2802 size = MEMCG_CACHES_MIN_SIZE; 2811 size = MEMCG_CACHES_MIN_SIZE;
2803 else if (size > MEMCG_CACHES_MAX_SIZE) 2812 else if (size > MEMCG_CACHES_MAX_SIZE)
2804 size = MEMCG_CACHES_MAX_SIZE; 2813 size = MEMCG_CACHES_MAX_SIZE;
2805 2814
2806 mutex_lock(&memcg_slab_mutex); 2815 mutex_lock(&memcg_slab_mutex);
2807 err = memcg_update_all_caches(size); 2816 err = memcg_update_all_caches(size);
2808 mutex_unlock(&memcg_slab_mutex); 2817 mutex_unlock(&memcg_slab_mutex);
2809 2818
2810 if (err) { 2819 if (err) {
2811 ida_simple_remove(&kmem_limited_groups, id); 2820 ida_simple_remove(&kmem_limited_groups, id);
2812 return err; 2821 return err;
2813 } 2822 }
2814 return id; 2823 return id;
2815 } 2824 }
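The growth policy for the memcg_caches arrays is simply "double past the new id, then clamp to the allowed range": id 3, for example, grows the arrays to 8 entries. A small sketch with placeholder limits (the real MEMCG_CACHES_MIN_SIZE/MEMCG_CACHES_MAX_SIZE values are not reproduced here):

        #include <stdio.h>

        #define CACHES_MIN_SIZE 4               /* placeholder values */
        #define CACHES_MAX_SIZE 65536

        static int next_array_size(int id)
        {
                int size = 2 * (id + 1);        /* double past the new id */

                if (size < CACHES_MIN_SIZE)
                        size = CACHES_MIN_SIZE;
                else if (size > CACHES_MAX_SIZE)
                        size = CACHES_MAX_SIZE;
                return size;
        }

        int main(void)
        {
                /* prints: 4 8 65536 */
                printf("%d %d %d\n", next_array_size(0), next_array_size(3),
                       next_array_size(100000));
                return 0;
        }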
2816 2825
2817 static void memcg_free_cache_id(int id) 2826 static void memcg_free_cache_id(int id)
2818 { 2827 {
2819 ida_simple_remove(&kmem_limited_groups, id); 2828 ida_simple_remove(&kmem_limited_groups, id);
2820 } 2829 }
2821 2830
2822 /* 2831 /*
2823 * We should update the current array size iff all cache updates succeed. This 2832 * We should update the current array size iff all cache updates succeed. This
2824 * can only be done from the slab side. The slab mutex needs to be held when 2833 * can only be done from the slab side. The slab mutex needs to be held when
2825 * calling this. 2834 * calling this.
2826 */ 2835 */
2827 void memcg_update_array_size(int num) 2836 void memcg_update_array_size(int num)
2828 { 2837 {
2829 memcg_limited_groups_array_size = num; 2838 memcg_limited_groups_array_size = num;
2830 } 2839 }
2831 2840
2832 static void memcg_register_cache(struct mem_cgroup *memcg, 2841 static void memcg_register_cache(struct mem_cgroup *memcg,
2833 struct kmem_cache *root_cache) 2842 struct kmem_cache *root_cache)
2834 { 2843 {
2835 static char memcg_name_buf[NAME_MAX + 1]; /* protected by 2844 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
2836 memcg_slab_mutex */ 2845 memcg_slab_mutex */
2837 struct kmem_cache *cachep; 2846 struct kmem_cache *cachep;
2838 int id; 2847 int id;
2839 2848
2840 lockdep_assert_held(&memcg_slab_mutex); 2849 lockdep_assert_held(&memcg_slab_mutex);
2841 2850
2842 id = memcg_cache_id(memcg); 2851 id = memcg_cache_id(memcg);
2843 2852
2844 /* 2853 /*
2845 * Since per-memcg caches are created asynchronously on first 2854 * Since per-memcg caches are created asynchronously on first
2846 * allocation (see memcg_kmem_get_cache()), several threads can try to 2855 * allocation (see memcg_kmem_get_cache()), several threads can try to
2847 * create the same cache, but only one of them may succeed. 2856 * create the same cache, but only one of them may succeed.
2848 */ 2857 */
2849 if (cache_from_memcg_idx(root_cache, id)) 2858 if (cache_from_memcg_idx(root_cache, id))
2850 return; 2859 return;
2851 2860
2852 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); 2861 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2853 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); 2862 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2854 /* 2863 /*
2855 * If we could not create a memcg cache, do not complain, because 2864 * If we could not create a memcg cache, do not complain, because
2856 * that's not critical at all as we can always proceed with the root 2865 * that's not critical at all as we can always proceed with the root
2857 * cache. 2866 * cache.
2858 */ 2867 */
2859 if (!cachep) 2868 if (!cachep)
2860 return; 2869 return;
2861 2870
2862 css_get(&memcg->css); 2871 css_get(&memcg->css);
2863 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 2872 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2864 2873
2865 /* 2874 /*
2866 * Since readers won't lock (see cache_from_memcg_idx()), we need a 2875 * Since readers won't lock (see cache_from_memcg_idx()), we need a
2867 * barrier here to ensure nobody will see the kmem_cache partially 2876 * barrier here to ensure nobody will see the kmem_cache partially
2868 * initialized. 2877 * initialized.
2869 */ 2878 */
2870 smp_wmb(); 2879 smp_wmb();
2871 2880
2872 BUG_ON(root_cache->memcg_params->memcg_caches[id]); 2881 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2873 root_cache->memcg_params->memcg_caches[id] = cachep; 2882 root_cache->memcg_params->memcg_caches[id] = cachep;
2874 } 2883 }
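The smp_wmb() above is the usual initialize-then-publish pattern for lockless readers: the cache is fully set up first and only then made visible through the memcg_caches array. A simplified user-space analogue using C11 release/acquire ordering (hypothetical names; the kernel instead pairs the write barrier with the ordering on the reader side in cache_from_memcg_idx()):

        #include <stdatomic.h>
        #include <stddef.h>

        struct cache {
                int object_size;
        };

        static struct cache slot_storage;
        static _Atomic(struct cache *) slot;    /* what lockless readers see */

        static void publish(int object_size)
        {
                slot_storage.object_size = object_size;         /* init first */
                atomic_store_explicit(&slot, &slot_storage,
                                      memory_order_release);    /* then publish */
        }

        static struct cache *lookup(void)
        {
                return atomic_load_explicit(&slot, memory_order_acquire);
        }

        int main(void)
        {
                publish(128);
                struct cache *c = lookup();

                return (c && c->object_size == 128) ? 0 : 1;
        }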
2875 2884
2876 static void memcg_unregister_cache(struct kmem_cache *cachep) 2885 static void memcg_unregister_cache(struct kmem_cache *cachep)
2877 { 2886 {
2878 struct kmem_cache *root_cache; 2887 struct kmem_cache *root_cache;
2879 struct mem_cgroup *memcg; 2888 struct mem_cgroup *memcg;
2880 int id; 2889 int id;
2881 2890
2882 lockdep_assert_held(&memcg_slab_mutex); 2891 lockdep_assert_held(&memcg_slab_mutex);
2883 2892
2884 BUG_ON(is_root_cache(cachep)); 2893 BUG_ON(is_root_cache(cachep));
2885 2894
2886 root_cache = cachep->memcg_params->root_cache; 2895 root_cache = cachep->memcg_params->root_cache;
2887 memcg = cachep->memcg_params->memcg; 2896 memcg = cachep->memcg_params->memcg;
2888 id = memcg_cache_id(memcg); 2897 id = memcg_cache_id(memcg);
2889 2898
2890 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); 2899 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2891 root_cache->memcg_params->memcg_caches[id] = NULL; 2900 root_cache->memcg_params->memcg_caches[id] = NULL;
2892 2901
2893 list_del(&cachep->memcg_params->list); 2902 list_del(&cachep->memcg_params->list);
2894 2903
2895 kmem_cache_destroy(cachep); 2904 kmem_cache_destroy(cachep);
2896 2905
2897 /* drop the reference taken in memcg_register_cache */ 2906 /* drop the reference taken in memcg_register_cache */
2898 css_put(&memcg->css); 2907 css_put(&memcg->css);
2899 } 2908 }
2900 2909
2901 /* 2910 /*
2902 * During the creation of a new cache, we need to disable our accounting mechanism 2911 * During the creation of a new cache, we need to disable our accounting mechanism
2903 * altogether. This is true even if we are not creating, but rather just 2912 * altogether. This is true even if we are not creating, but rather just
2904 * enqueueing new caches to be created. 2913 * enqueueing new caches to be created.
2905 * 2914 *
2906 * This is because that process will trigger allocations; some visible, like 2915 * This is because that process will trigger allocations; some visible, like
2907 * explicit kmallocs to auxiliary data structures, name strings and internal 2916 * explicit kmallocs to auxiliary data structures, name strings and internal
2908 * cache structures; some well concealed, like INIT_WORK() that can allocate 2917 * cache structures; some well concealed, like INIT_WORK() that can allocate
2909 * objects during debug. 2918 * objects during debug.
2910 * 2919 *
2911 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 2920 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
2912 * to it. This may not be a bounded recursion: since the first cache creation 2921 * to it. This may not be a bounded recursion: since the first cache creation
2913 * failed to complete (waiting on the allocation), we'll just try to create the 2922 * failed to complete (waiting on the allocation), we'll just try to create the
2914 * cache again, failing at the same point. 2923 * cache again, failing at the same point.
2915 * 2924 *
2916 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 2925 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
2917 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 2926 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
2918 * inside the following two functions. 2927 * inside the following two functions.
2919 */ 2928 */
2920 static inline void memcg_stop_kmem_account(void) 2929 static inline void memcg_stop_kmem_account(void)
2921 { 2930 {
2922 VM_BUG_ON(!current->mm); 2931 VM_BUG_ON(!current->mm);
2923 current->memcg_kmem_skip_account++; 2932 current->memcg_kmem_skip_account++;
2924 } 2933 }
2925 2934
2926 static inline void memcg_resume_kmem_account(void) 2935 static inline void memcg_resume_kmem_account(void)
2927 { 2936 {
2928 VM_BUG_ON(!current->mm); 2937 VM_BUG_ON(!current->mm);
2929 current->memcg_kmem_skip_account--; 2938 current->memcg_kmem_skip_account--;
2930 } 2939 }
2931 2940
2932 int __memcg_cleanup_cache_params(struct kmem_cache *s) 2941 int __memcg_cleanup_cache_params(struct kmem_cache *s)
2933 { 2942 {
2934 struct kmem_cache *c; 2943 struct kmem_cache *c;
2935 int i, failed = 0; 2944 int i, failed = 0;
2936 2945
2937 mutex_lock(&memcg_slab_mutex); 2946 mutex_lock(&memcg_slab_mutex);
2938 for_each_memcg_cache_index(i) { 2947 for_each_memcg_cache_index(i) {
2939 c = cache_from_memcg_idx(s, i); 2948 c = cache_from_memcg_idx(s, i);
2940 if (!c) 2949 if (!c)
2941 continue; 2950 continue;
2942 2951
2943 memcg_unregister_cache(c); 2952 memcg_unregister_cache(c);
2944 2953
2945 if (cache_from_memcg_idx(s, i)) 2954 if (cache_from_memcg_idx(s, i))
2946 failed++; 2955 failed++;
2947 } 2956 }
2948 mutex_unlock(&memcg_slab_mutex); 2957 mutex_unlock(&memcg_slab_mutex);
2949 return failed; 2958 return failed;
2950 } 2959 }
2951 2960
2952 static void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2961 static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2953 { 2962 {
2954 struct kmem_cache *cachep; 2963 struct kmem_cache *cachep;
2955 struct memcg_cache_params *params, *tmp; 2964 struct memcg_cache_params *params, *tmp;
2956 2965
2957 if (!memcg_kmem_is_active(memcg)) 2966 if (!memcg_kmem_is_active(memcg))
2958 return; 2967 return;
2959 2968
2960 mutex_lock(&memcg_slab_mutex); 2969 mutex_lock(&memcg_slab_mutex);
2961 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { 2970 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2962 cachep = memcg_params_to_cache(params); 2971 cachep = memcg_params_to_cache(params);
2963 kmem_cache_shrink(cachep); 2972 kmem_cache_shrink(cachep);
2964 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 2973 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
2965 memcg_unregister_cache(cachep); 2974 memcg_unregister_cache(cachep);
2966 } 2975 }
2967 mutex_unlock(&memcg_slab_mutex); 2976 mutex_unlock(&memcg_slab_mutex);
2968 } 2977 }
2969 2978
2970 struct memcg_register_cache_work { 2979 struct memcg_register_cache_work {
2971 struct mem_cgroup *memcg; 2980 struct mem_cgroup *memcg;
2972 struct kmem_cache *cachep; 2981 struct kmem_cache *cachep;
2973 struct work_struct work; 2982 struct work_struct work;
2974 }; 2983 };
2975 2984
2976 static void memcg_register_cache_func(struct work_struct *w) 2985 static void memcg_register_cache_func(struct work_struct *w)
2977 { 2986 {
2978 struct memcg_register_cache_work *cw = 2987 struct memcg_register_cache_work *cw =
2979 container_of(w, struct memcg_register_cache_work, work); 2988 container_of(w, struct memcg_register_cache_work, work);
2980 struct mem_cgroup *memcg = cw->memcg; 2989 struct mem_cgroup *memcg = cw->memcg;
2981 struct kmem_cache *cachep = cw->cachep; 2990 struct kmem_cache *cachep = cw->cachep;
2982 2991
2983 mutex_lock(&memcg_slab_mutex); 2992 mutex_lock(&memcg_slab_mutex);
2984 memcg_register_cache(memcg, cachep); 2993 memcg_register_cache(memcg, cachep);
2985 mutex_unlock(&memcg_slab_mutex); 2994 mutex_unlock(&memcg_slab_mutex);
2986 2995
2987 css_put(&memcg->css); 2996 css_put(&memcg->css);
2988 kfree(cw); 2997 kfree(cw);
2989 } 2998 }
2990 2999
2991 /* 3000 /*
2992 * Enqueue the creation of a per-memcg kmem_cache. 3001 * Enqueue the creation of a per-memcg kmem_cache.
2993 */ 3002 */
2994 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 3003 static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2995 struct kmem_cache *cachep) 3004 struct kmem_cache *cachep)
2996 { 3005 {
2997 struct memcg_register_cache_work *cw; 3006 struct memcg_register_cache_work *cw;
2998 3007
2999 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 3008 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
3000 if (cw == NULL) { 3009 if (cw == NULL) {
3001 css_put(&memcg->css); 3010 css_put(&memcg->css);
3002 return; 3011 return;
3003 } 3012 }
3004 3013
3005 cw->memcg = memcg; 3014 cw->memcg = memcg;
3006 cw->cachep = cachep; 3015 cw->cachep = cachep;
3007 3016
3008 INIT_WORK(&cw->work, memcg_register_cache_func); 3017 INIT_WORK(&cw->work, memcg_register_cache_func);
3009 schedule_work(&cw->work); 3018 schedule_work(&cw->work);
3010 } 3019 }
3011 3020
3012 static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 3021 static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
3013 struct kmem_cache *cachep) 3022 struct kmem_cache *cachep)
3014 { 3023 {
3015 /* 3024 /*
3016 * We need to stop accounting when we kmalloc, because if the 3025 * We need to stop accounting when we kmalloc, because if the
3017 * corresponding kmalloc cache is not yet created, the first allocation 3026 * corresponding kmalloc cache is not yet created, the first allocation
3018 * in __memcg_schedule_register_cache will recurse. 3027 * in __memcg_schedule_register_cache will recurse.
3019 * 3028 *
3020 * However, it is better to enclose the whole function. Depending on 3029 * However, it is better to enclose the whole function. Depending on
3021 * the debugging options enabled, INIT_WORK(), for instance, can 3030 * the debugging options enabled, INIT_WORK(), for instance, can
3022 * trigger an allocation. This, too, will make us recurse. Because at 3031 * trigger an allocation. This, too, will make us recurse. Because at
3023 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3032 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3024 * the safest choice is to do it like this, wrapping the whole function. 3033 * the safest choice is to do it like this, wrapping the whole function.
3025 */ 3034 */
3026 memcg_stop_kmem_account(); 3035 memcg_stop_kmem_account();
3027 __memcg_schedule_register_cache(memcg, cachep); 3036 __memcg_schedule_register_cache(memcg, cachep);
3028 memcg_resume_kmem_account(); 3037 memcg_resume_kmem_account();
3029 } 3038 }
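memcg_stop_kmem_account()/memcg_resume_kmem_account() act as a per-task recursion guard: while the counter is raised, the accounting hook becomes a no-op, so the internal allocations made during cache creation cannot recurse back into memcg_kmem_get_cache(). A user-space sketch of that guard (hypothetical names, thread-local state instead of a per-task field):

        #include <stdbool.h>
        #include <stdio.h>

        static _Thread_local int skip_account;

        static void stop_account(void)   { skip_account++; }
        static void resume_account(void) { skip_account--; }

        static bool account_alloc(size_t size)
        {
                if (skip_account)
                        return false;   /* internal allocation: don't account */
                printf("accounted %zu bytes\n", size);
                return true;
        }

        static void schedule_internal_work(void)
        {
                stop_account();
                account_alloc(64);      /* would otherwise recurse back into us */
                resume_account();
        }

        int main(void)
        {
                schedule_internal_work();       /* nothing gets accounted */
                account_alloc(4096);            /* accounted normally */
                return 0;
        }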
3030 3039
3031 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) 3040 int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
3032 { 3041 {
3033 unsigned int nr_pages = 1 << order; 3042 unsigned int nr_pages = 1 << order;
3034 int res; 3043 int res;
3035 3044
3036 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); 3045 res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
3037 if (!res) 3046 if (!res)
3038 atomic_add(nr_pages, &cachep->memcg_params->nr_pages); 3047 atomic_add(nr_pages, &cachep->memcg_params->nr_pages);
3039 return res; 3048 return res;
3040 } 3049 }
3041 3050
3042 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) 3051 void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
3043 { 3052 {
3044 unsigned int nr_pages = 1 << order; 3053 unsigned int nr_pages = 1 << order;
3045 3054
3046 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); 3055 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
3047 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); 3056 atomic_sub(nr_pages, &cachep->memcg_params->nr_pages);
3048 } 3057 }
3049 3058
3050 /* 3059 /*
3051 * Return the kmem_cache we're supposed to use for a slab allocation. 3060 * Return the kmem_cache we're supposed to use for a slab allocation.
3052 * We try to use the current memcg's version of the cache. 3061 * We try to use the current memcg's version of the cache.
3053 * 3062 *
3054 * If the cache does not exist yet (i.e. we are the first user of it), 3063 * If the cache does not exist yet (i.e. we are the first user of it),
3055 * we either create it immediately, if possible, or create it asynchronously 3064 * we either create it immediately, if possible, or create it asynchronously
3056 * in a workqueue. 3065 * in a workqueue.
3057 * In the latter case, we will let the current allocation go through with 3066 * In the latter case, we will let the current allocation go through with
3058 * the original cache. 3067 * the original cache.
3059 * 3068 *
3060 * Can't be called in interrupt context or from kernel threads. 3069 * Can't be called in interrupt context or from kernel threads.
3061 * This function needs to be called with rcu_read_lock() held. 3070 * This function needs to be called with rcu_read_lock() held.
3062 */ 3071 */
3063 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3072 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3064 gfp_t gfp) 3073 gfp_t gfp)
3065 { 3074 {
3066 struct mem_cgroup *memcg; 3075 struct mem_cgroup *memcg;
3067 struct kmem_cache *memcg_cachep; 3076 struct kmem_cache *memcg_cachep;
3068 3077
3069 VM_BUG_ON(!cachep->memcg_params); 3078 VM_BUG_ON(!cachep->memcg_params);
3070 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3079 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3071 3080
3072 if (!current->mm || current->memcg_kmem_skip_account) 3081 if (!current->mm || current->memcg_kmem_skip_account)
3073 return cachep; 3082 return cachep;
3074 3083
3075 rcu_read_lock(); 3084 rcu_read_lock();
3076 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3085 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3077 3086
3078 if (!memcg_kmem_is_active(memcg)) 3087 if (!memcg_kmem_is_active(memcg))
3079 goto out; 3088 goto out;
3080 3089
3081 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3090 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3082 if (likely(memcg_cachep)) { 3091 if (likely(memcg_cachep)) {
3083 cachep = memcg_cachep; 3092 cachep = memcg_cachep;
3084 goto out; 3093 goto out;
3085 } 3094 }
3086 3095
3087 /* The corresponding put will be done in the workqueue. */ 3096 /* The corresponding put will be done in the workqueue. */
3088 if (!css_tryget_online(&memcg->css)) 3097 if (!css_tryget_online(&memcg->css))
3089 goto out; 3098 goto out;
3090 rcu_read_unlock(); 3099 rcu_read_unlock();
3091 3100
3092 /* 3101 /*
3093 * If we are in a safe context (can wait, and not in interrupt 3102 * If we are in a safe context (can wait, and not in interrupt
3094 * context), we could be predictable and return right away. 3103 * context), we could be predictable and return right away.
3095 * This would guarantee that the allocation being performed 3104 * This would guarantee that the allocation being performed
3096 * already belongs in the new cache. 3105 * already belongs in the new cache.
3097 * 3106 *
3098 * However, there are some clashes that can arise from locking. 3107 * However, there are some clashes that can arise from locking.
3099 * For instance, because we acquire the slab_mutex while doing 3108 * For instance, because we acquire the slab_mutex while doing
3100 * memcg_create_kmem_cache, this means no further allocation 3109 * memcg_create_kmem_cache, this means no further allocation
3101 * could happen with the slab_mutex held. So it's better to 3110 * could happen with the slab_mutex held. So it's better to
3102 * defer everything. 3111 * defer everything.
3103 */ 3112 */
3104 memcg_schedule_register_cache(memcg, cachep); 3113 memcg_schedule_register_cache(memcg, cachep);
3105 return cachep; 3114 return cachep;
3106 out: 3115 out:
3107 rcu_read_unlock(); 3116 rcu_read_unlock();
3108 return cachep; 3117 return cachep;
3109 } 3118 }
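So the selection logic boils down to: use the per-memcg copy when it already exists, otherwise kick off its creation and let the current allocation proceed with the root cache. Roughly, as a user-space sketch (hypothetical names; no RCU, refcounting or per-task checks shown):

        #include <stdbool.h>
        #include <stdio.h>

        struct cache { const char *name; };

        static struct cache root_cache = { "root" };
        static struct cache *per_group_copy;    /* created asynchronously */

        static void schedule_create(void)
        {
                printf("scheduling per-group cache creation\n");
        }

        static struct cache *pick_cache(bool accounting_active)
        {
                if (!accounting_active)
                        return &root_cache;
                if (per_group_copy)
                        return per_group_copy;  /* fast path once it exists */
                schedule_create();
                return &root_cache;             /* fall back for this allocation */
        }

        int main(void)
        {
                struct cache *c = pick_cache(true);     /* schedules, falls back */

                printf("using %s cache\n", c->name);
                return 0;
        }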
3110 3119
3111 /* 3120 /*
3112 * We need to verify if the allocation against current->mm->owner's memcg is 3121 * We need to verify if the allocation against current->mm->owner's memcg is
3113 * possible for the given order. But the page is not allocated yet, so we'll 3122 * possible for the given order. But the page is not allocated yet, so we'll
3114 * need a further commit step to do the final arrangements. 3123 * need a further commit step to do the final arrangements.
3115 * 3124 *
3116 * It is possible for the task to switch cgroups in the meantime, so at 3125 * It is possible for the task to switch cgroups in the meantime, so at
3117 * commit time, we can't rely on task conversion any longer. We'll then use 3126 * commit time, we can't rely on task conversion any longer. We'll then use
3118 * the handle argument to return to the caller which cgroup we should commit 3127 * the handle argument to return to the caller which cgroup we should commit
3119 * against. We could also return the memcg directly and avoid the pointer 3128 * against. We could also return the memcg directly and avoid the pointer
3120 * passing, but a boolean return value gives better semantics considering 3129 * passing, but a boolean return value gives better semantics considering
3121 * the compiled-out case as well. 3130 * the compiled-out case as well.
3122 * 3131 *
3123 * Returning true means the allocation is possible. 3132 * Returning true means the allocation is possible.
3124 */ 3133 */
3125 bool 3134 bool
3126 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3135 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3127 { 3136 {
3128 struct mem_cgroup *memcg; 3137 struct mem_cgroup *memcg;
3129 int ret; 3138 int ret;
3130 3139
3131 *_memcg = NULL; 3140 *_memcg = NULL;
3132 3141
3133 /* 3142 /*
3134 * Disabling accounting is only relevant for some specific memcg 3143 * Disabling accounting is only relevant for some specific memcg
3135 * internal allocations. Therefore we would initially not have such a 3144 * internal allocations. Therefore we would initially not have such a
3136 * check here, since direct calls to the page allocator that are 3145 * check here, since direct calls to the page allocator that are
3137 * accounted to kmemcg (alloc_kmem_pages and friends) only happen 3146 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
3138 * outside memcg core. We are mostly concerned with cache allocations, 3147 * outside memcg core. We are mostly concerned with cache allocations,
3139 * and by having this test at memcg_kmem_get_cache, we are already able 3148 * and by having this test at memcg_kmem_get_cache, we are already able
3140 * to relay the allocation to the root cache and bypass the memcg cache 3149 * to relay the allocation to the root cache and bypass the memcg cache
3141 * altogether. 3150 * altogether.
3142 * 3151 *
3143 * There is one exception, though: the SLUB allocator does not create 3152 * There is one exception, though: the SLUB allocator does not create
3144 * large order caches, but rather services large kmallocs directly from 3153 * large order caches, but rather services large kmallocs directly from
3145 * the page allocator. Therefore, the following sequence when backed by 3154 * the page allocator. Therefore, the following sequence when backed by
3146 * the SLUB allocator: 3155 * the SLUB allocator:
3147 * 3156 *
3148 * memcg_stop_kmem_account(); 3157 * memcg_stop_kmem_account();
3149 * kmalloc(<large_number>) 3158 * kmalloc(<large_number>)
3150 * memcg_resume_kmem_account(); 3159 * memcg_resume_kmem_account();
3151 * 3160 *
3152 * would effectively ignore the fact that we should skip accounting, 3161 * would effectively ignore the fact that we should skip accounting,
3153 * since it will drive us directly to this function without passing 3162 * since it will drive us directly to this function without passing
3154 * through the cache selector memcg_kmem_get_cache. Such large 3163 * through the cache selector memcg_kmem_get_cache. Such large
3155 * allocations are extremely rare but can happen, for instance, for the 3164 * allocations are extremely rare but can happen, for instance, for the
3156 * cache arrays. We bring this test here. 3165 * cache arrays. We bring this test here.
3157 */ 3166 */
3158 if (!current->mm || current->memcg_kmem_skip_account) 3167 if (!current->mm || current->memcg_kmem_skip_account)
3159 return true; 3168 return true;
3160 3169
3161 memcg = get_mem_cgroup_from_mm(current->mm); 3170 memcg = get_mem_cgroup_from_mm(current->mm);
3162 3171
3163 if (!memcg_kmem_is_active(memcg)) { 3172 if (!memcg_kmem_is_active(memcg)) {
3164 css_put(&memcg->css); 3173 css_put(&memcg->css);
3165 return true; 3174 return true;
3166 } 3175 }
3167 3176
3168 ret = memcg_charge_kmem(memcg, gfp, 1 << order); 3177 ret = memcg_charge_kmem(memcg, gfp, 1 << order);
3169 if (!ret) 3178 if (!ret)
3170 *_memcg = memcg; 3179 *_memcg = memcg;
3171 3180
3172 css_put(&memcg->css); 3181 css_put(&memcg->css);
3173 return (ret == 0); 3182 return (ret == 0);
3174 } 3183 }
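The charge is thus split into a reservation made before the page exists and a commit (or revert) once the allocation outcome is known. A bare-bones sketch of that protocol (user-space analogue, hypothetical names; malloc() merely stands in for the page allocator):

        #include <stdbool.h>
        #include <stdlib.h>

        struct group { long charged; };

        static bool newpage_charge(struct group *g, unsigned int nr_pages)
        {
                g->charged += nr_pages;         /* reserve before allocating */
                return true;                    /* "allocation is possible" */
        }

        static void commit_or_revert(struct group *g, void *page,
                                     unsigned int nr_pages)
        {
                if (!page)
                        g->charged -= nr_pages; /* allocation failed: revert */
                /* else: record g as the owner of @page */
        }

        int main(void)
        {
                struct group g = { 0 };
                unsigned int nr = 1;            /* an order-0 allocation */

                if (newpage_charge(&g, nr)) {
                        void *page = malloc(4096);      /* alloc_pages() stand-in */

                        commit_or_revert(&g, page, nr);
                        free(page);
                }
                return 0;
        }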
3175 3184
3176 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3185 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3177 int order) 3186 int order)
3178 { 3187 {
3179 struct page_cgroup *pc; 3188 struct page_cgroup *pc;
3180 3189
3181 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3190 VM_BUG_ON(mem_cgroup_is_root(memcg));
3182 3191
3183 /* The page allocation failed. Revert */ 3192 /* The page allocation failed. Revert */
3184 if (!page) { 3193 if (!page) {
3185 memcg_uncharge_kmem(memcg, 1 << order); 3194 memcg_uncharge_kmem(memcg, 1 << order);
3186 return; 3195 return;
3187 } 3196 }
3188 /* 3197 /*
3189 * The page is freshly allocated and not visible to any 3198 * The page is freshly allocated and not visible to any
3190 * outside callers yet. Set up pc non-atomically. 3199 * outside callers yet. Set up pc non-atomically.
3191 */ 3200 */
3192 pc = lookup_page_cgroup(page); 3201 pc = lookup_page_cgroup(page);
3193 pc->mem_cgroup = memcg; 3202 pc->mem_cgroup = memcg;
3194 pc->flags = PCG_USED; 3203 pc->flags = PCG_USED;
3195 } 3204 }
3196 3205
3197 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3206 void __memcg_kmem_uncharge_pages(struct page *page, int order)
3198 { 3207 {
3199 struct mem_cgroup *memcg = NULL; 3208 struct mem_cgroup *memcg = NULL;
3200 struct page_cgroup *pc; 3209 struct page_cgroup *pc;
3201 3210
3202 3211
3203 pc = lookup_page_cgroup(page); 3212 pc = lookup_page_cgroup(page);
3204 if (!PageCgroupUsed(pc)) 3213 if (!PageCgroupUsed(pc))
3205 return; 3214 return;
3206 3215
3207 memcg = pc->mem_cgroup; 3216 memcg = pc->mem_cgroup;
3208 pc->flags = 0; 3217 pc->flags = 0;
3209 3218
3210 /* 3219 /*
3211 * We trust that the allocation is valid only if there is a memcg 3220 * We trust that the allocation is valid only if there is a memcg
3212 * associated with the page 3221 * associated with the page
3213 */ 3222 */
3214 if (!memcg) 3223 if (!memcg)
3215 return; 3224 return;
3216 3225
3217 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3226 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3218 memcg_uncharge_kmem(memcg, 1 << order); 3227 memcg_uncharge_kmem(memcg, 1 << order);
3219 } 3228 }
3220 #else 3229 #else
3221 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 3230 static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3222 { 3231 {
3223 } 3232 }
3224 #endif /* CONFIG_MEMCG_KMEM */ 3233 #endif /* CONFIG_MEMCG_KMEM */
3225 3234
3226 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3235 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3227 3236
3228 /* 3237 /*
3229 * Because tail pages are not marked as "used", mark them here. We're under 3238 * Because tail pages are not marked as "used", mark them here. We're under
3230 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3239 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3231 * charge/uncharge will never happen and move_account() is done under 3240 * charge/uncharge will never happen and move_account() is done under
3232 * compound_lock(), so we don't have to take care of races. 3241 * compound_lock(), so we don't have to take care of races.
3233 */ 3242 */
3234 void mem_cgroup_split_huge_fixup(struct page *head) 3243 void mem_cgroup_split_huge_fixup(struct page *head)
3235 { 3244 {
3236 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3245 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3237 struct page_cgroup *pc; 3246 struct page_cgroup *pc;
3238 struct mem_cgroup *memcg; 3247 struct mem_cgroup *memcg;
3239 int i; 3248 int i;
3240 3249
3241 if (mem_cgroup_disabled()) 3250 if (mem_cgroup_disabled())
3242 return; 3251 return;
3243 3252
3244 memcg = head_pc->mem_cgroup; 3253 memcg = head_pc->mem_cgroup;
3245 for (i = 1; i < HPAGE_PMD_NR; i++) { 3254 for (i = 1; i < HPAGE_PMD_NR; i++) {
3246 pc = head_pc + i; 3255 pc = head_pc + i;
3247 pc->mem_cgroup = memcg; 3256 pc->mem_cgroup = memcg;
3248 pc->flags = head_pc->flags; 3257 pc->flags = head_pc->flags;
3249 } 3258 }
3250 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3259 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3251 HPAGE_PMD_NR); 3260 HPAGE_PMD_NR);
3252 } 3261 }
3253 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3262 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3254 3263
3255 /** 3264 /**
3256 * mem_cgroup_move_account - move account of the page 3265 * mem_cgroup_move_account - move account of the page
3257 * @page: the page 3266 * @page: the page
3258 * @nr_pages: number of regular pages (>1 for huge pages) 3267 * @nr_pages: number of regular pages (>1 for huge pages)
3259 * @pc: page_cgroup of the page. 3268 * @pc: page_cgroup of the page.
3260 * @from: mem_cgroup which the page is moved from. 3269 * @from: mem_cgroup which the page is moved from.
3261 * @to: mem_cgroup which the page is moved to. @from != @to. 3270 * @to: mem_cgroup which the page is moved to. @from != @to.
3262 * 3271 *
3263 * The caller must confirm the following. 3272 * The caller must confirm the following.
3264 * - page is not on LRU (isolate_page() is useful.) 3273 * - page is not on LRU (isolate_page() is useful.)
3265 * - compound_lock is held when nr_pages > 1 3274 * - compound_lock is held when nr_pages > 1
3266 * 3275 *
3267 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3276 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3268 * from old cgroup. 3277 * from old cgroup.
3269 */ 3278 */
3270 static int mem_cgroup_move_account(struct page *page, 3279 static int mem_cgroup_move_account(struct page *page,
3271 unsigned int nr_pages, 3280 unsigned int nr_pages,
3272 struct page_cgroup *pc, 3281 struct page_cgroup *pc,
3273 struct mem_cgroup *from, 3282 struct mem_cgroup *from,
3274 struct mem_cgroup *to) 3283 struct mem_cgroup *to)
3275 { 3284 {
3276 unsigned long flags; 3285 unsigned long flags;
3277 int ret; 3286 int ret;
3278 3287
3279 VM_BUG_ON(from == to); 3288 VM_BUG_ON(from == to);
3280 VM_BUG_ON_PAGE(PageLRU(page), page); 3289 VM_BUG_ON_PAGE(PageLRU(page), page);
3281 /* 3290 /*
3282 * The page is isolated from LRU. So, collapse function 3291 * The page is isolated from LRU. So, collapse function
3283 * will not handle this page. But page splitting can happen. 3292 * will not handle this page. But page splitting can happen.
3284 * Do this check under compound_page_lock(). The caller should 3293 * Do this check under compound_page_lock(). The caller should
3285 * hold it. 3294 * hold it.
3286 */ 3295 */
3287 ret = -EBUSY; 3296 ret = -EBUSY;
3288 if (nr_pages > 1 && !PageTransHuge(page)) 3297 if (nr_pages > 1 && !PageTransHuge(page))
3289 goto out; 3298 goto out;
3290 3299
3291 /* 3300 /*
3292 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3301 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
3293 * of its source page while we change it: page migration takes 3302 * of its source page while we change it: page migration takes
3294 * both pages off the LRU, but page cache replacement doesn't. 3303 * both pages off the LRU, but page cache replacement doesn't.
3295 */ 3304 */
3296 if (!trylock_page(page)) 3305 if (!trylock_page(page))
3297 goto out; 3306 goto out;
3298 3307
3299 ret = -EINVAL; 3308 ret = -EINVAL;
3300 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3309 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3301 goto out_unlock; 3310 goto out_unlock;
3302 3311
3303 move_lock_mem_cgroup(from, &flags); 3312 move_lock_mem_cgroup(from, &flags);
3304 3313
3305 if (!PageAnon(page) && page_mapped(page)) { 3314 if (!PageAnon(page) && page_mapped(page)) {
3306 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3315 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3307 nr_pages); 3316 nr_pages);
3308 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3317 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3309 nr_pages); 3318 nr_pages);
3310 } 3319 }
3311 3320
3312 if (PageWriteback(page)) { 3321 if (PageWriteback(page)) {
3313 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3322 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3314 nr_pages); 3323 nr_pages);
3315 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3324 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3316 nr_pages); 3325 nr_pages);
3317 } 3326 }
3318 3327
3319 /* 3328 /*
3320 * It is safe to change pc->mem_cgroup here because the page 3329 * It is safe to change pc->mem_cgroup here because the page
3321 * is referenced, charged, and isolated - we can't race with 3330 * is referenced, charged, and isolated - we can't race with
3322 * uncharging, charging, migration, or LRU putback. 3331 * uncharging, charging, migration, or LRU putback.
3323 */ 3332 */
3324 3333
3325 /* caller should have done css_get */ 3334 /* caller should have done css_get */
3326 pc->mem_cgroup = to; 3335 pc->mem_cgroup = to;
3327 move_unlock_mem_cgroup(from, &flags); 3336 move_unlock_mem_cgroup(from, &flags);
3328 ret = 0; 3337 ret = 0;
3329 3338
3330 local_irq_disable(); 3339 local_irq_disable();
3331 mem_cgroup_charge_statistics(to, page, nr_pages); 3340 mem_cgroup_charge_statistics(to, page, nr_pages);
3332 memcg_check_events(to, page); 3341 memcg_check_events(to, page);
3333 mem_cgroup_charge_statistics(from, page, -nr_pages); 3342 mem_cgroup_charge_statistics(from, page, -nr_pages);
3334 memcg_check_events(from, page); 3343 memcg_check_events(from, page);
3335 local_irq_enable(); 3344 local_irq_enable();
3336 out_unlock: 3345 out_unlock:
3337 unlock_page(page); 3346 unlock_page(page);
3338 out: 3347 out:
3339 return ret; 3348 return ret;
3340 } 3349 }
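The core of mem_cgroup_move_account() is: verify the page still belongs to @from under the move lock, transfer the per-group statistics, then flip the owner pointer. A minimal userspace sketch of that check-and-transfer idiom is below; the account/item structs, the single "mapped" counter, and the -EINVAL convention are inventions for illustration (the trylock_page()/-EBUSY step and the per-cpu statistics are omitted), not the kernel's API.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct account {
	pthread_mutex_t move_lock;   /* stands in for move_lock_mem_cgroup() */
	long mapped;                 /* stands in for one per-group statistic */
};

struct item {
	struct account *owner;       /* stands in for pc->mem_cgroup */
	int mapped;                  /* non-zero if the item contributes to 'mapped' */
};

/* Move one item from 'from' to 'to'; fails if the item changed owner meanwhile. */
static int move_account(struct item *it, struct account *from, struct account *to)
{
	int ret = -EINVAL;

	pthread_mutex_lock(&from->move_lock);
	if (it->owner != from)
		goto out;              /* lost a race: someone else moved/uncharged it */

	if (it->mapped) {              /* the statistic travels with the item */
		from->mapped--;
		to->mapped++;
	}
	it->owner = to;                /* the actual "move": flip the owner pointer */
	ret = 0;
out:
	pthread_mutex_unlock(&from->move_lock);
	return ret;
}

int main(void)
{
	struct account a = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct account b = { PTHREAD_MUTEX_INITIALIZER, 0 };
	struct item it = { &a, 1 };
	int ret = move_account(&it, &a, &b);

	printf("move: %d, a.mapped=%ld, b.mapped=%ld\n", ret, a.mapped, b.mapped);
	return 0;
}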
3341 3350
3342 /** 3351 /**
3343 * mem_cgroup_move_parent - moves page to the parent group 3352 * mem_cgroup_move_parent - moves page to the parent group
3344 * @page: the page to move 3353 * @page: the page to move
3345 * @pc: page_cgroup of the page 3354 * @pc: page_cgroup of the page
3346 * @child: page's cgroup 3355 * @child: page's cgroup
3347 * 3356 *
3348 * move charges to its parent or the root cgroup if the group has no 3357 * move charges to its parent or the root cgroup if the group has no
3349 * parent (aka use_hierarchy==0). 3358 * parent (aka use_hierarchy==0).
3350 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3359 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3351 * mem_cgroup_move_account fails) the failure is always temporary and 3360 * mem_cgroup_move_account fails) the failure is always temporary and
3352 * it signals a race with a page removal/uncharge or migration. In the 3361 * it signals a race with a page removal/uncharge or migration. In the
3353 * first case the page is on the way out and it will vanish from the LRU 3362 * first case the page is on the way out and it will vanish from the LRU
3354 * on the next attempt and the call should be retried later. 3363 * on the next attempt and the call should be retried later.
3355 * Isolation from the LRU fails only if page has been isolated from 3364 * Isolation from the LRU fails only if page has been isolated from
3356 * the LRU since we looked at it and that usually means either global 3365 * the LRU since we looked at it and that usually means either global
3357 * reclaim or migration going on. The page will either get back to the 3366 * reclaim or migration going on. The page will either get back to the
3358 * LRU or vanish. 3367 * LRU or vanish.
3359 * Finally mem_cgroup_move_account fails only if the page got uncharged 3368 * Finally mem_cgroup_move_account fails only if the page got uncharged
3360 * (!PageCgroupUsed) or moved to a different group. The page will 3369 * (!PageCgroupUsed) or moved to a different group. The page will
3361 * disappear in the next attempt. 3370 * disappear in the next attempt.
3362 */ 3371 */
3363 static int mem_cgroup_move_parent(struct page *page, 3372 static int mem_cgroup_move_parent(struct page *page,
3364 struct page_cgroup *pc, 3373 struct page_cgroup *pc,
3365 struct mem_cgroup *child) 3374 struct mem_cgroup *child)
3366 { 3375 {
3367 struct mem_cgroup *parent; 3376 struct mem_cgroup *parent;
3368 unsigned int nr_pages; 3377 unsigned int nr_pages;
3369 unsigned long uninitialized_var(flags); 3378 unsigned long uninitialized_var(flags);
3370 int ret; 3379 int ret;
3371 3380
3372 VM_BUG_ON(mem_cgroup_is_root(child)); 3381 VM_BUG_ON(mem_cgroup_is_root(child));
3373 3382
3374 ret = -EBUSY; 3383 ret = -EBUSY;
3375 if (!get_page_unless_zero(page)) 3384 if (!get_page_unless_zero(page))
3376 goto out; 3385 goto out;
3377 if (isolate_lru_page(page)) 3386 if (isolate_lru_page(page))
3378 goto put; 3387 goto put;
3379 3388
3380 nr_pages = hpage_nr_pages(page); 3389 nr_pages = hpage_nr_pages(page);
3381 3390
3382 parent = parent_mem_cgroup(child); 3391 parent = parent_mem_cgroup(child);
3383 /* 3392 /*
3384 * If no parent, move charges to root cgroup. 3393 * If no parent, move charges to root cgroup.
3385 */ 3394 */
3386 if (!parent) 3395 if (!parent)
3387 parent = root_mem_cgroup; 3396 parent = root_mem_cgroup;
3388 3397
3389 if (nr_pages > 1) { 3398 if (nr_pages > 1) {
3390 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3399 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3391 flags = compound_lock_irqsave(page); 3400 flags = compound_lock_irqsave(page);
3392 } 3401 }
3393 3402
3394 ret = mem_cgroup_move_account(page, nr_pages, 3403 ret = mem_cgroup_move_account(page, nr_pages,
3395 pc, child, parent); 3404 pc, child, parent);
3396 if (!ret) { 3405 if (!ret) {
3406 if (!mem_cgroup_is_root(parent))
3407 css_get_many(&parent->css, nr_pages);
3397 /* Take charge off the local counters */ 3408 /* Take charge off the local counters */
3398 page_counter_cancel(&child->memory, nr_pages); 3409 page_counter_cancel(&child->memory, nr_pages);
3399 if (do_swap_account) 3410 if (do_swap_account)
3400 page_counter_cancel(&child->memsw, nr_pages); 3411 page_counter_cancel(&child->memsw, nr_pages);
3412 css_put_many(&child->css, nr_pages);
3401 } 3413 }
3402 3414
3403 if (nr_pages > 1) 3415 if (nr_pages > 1)
3404 compound_unlock_irqrestore(page, flags); 3416 compound_unlock_irqrestore(page, flags);
3405 putback_lru_page(page); 3417 putback_lru_page(page);
3406 put: 3418 put:
3407 put_page(page); 3419 put_page(page);
3408 out: 3420 out:
3409 return ret; 3421 return ret;
3410 } 3422 }
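The added hunk above is this patch's change to the reparenting path: when nr_pages charges move to the parent, the parent gains one css reference per page and the child drops one per page, making the "each charged page pins its css" rule explicit. The toy refcount below, using C11 atomics, shows only that bookkeeping; the get_many()/put_many() names mirror css_get_many()/css_put_many() but this is not the percpu-refcount machinery the kernel actually uses.

#include <stdatomic.h>
#include <stdio.h>

struct css {
	const char *name;
	atomic_long refcnt;
};

static void css_get_many(struct css *css, unsigned long n)
{
	atomic_fetch_add(&css->refcnt, (long)n);
}

static void css_put_many(struct css *css, unsigned long n)
{
	if (atomic_fetch_sub(&css->refcnt, (long)n) == (long)n)
		printf("%s: last reference dropped, group can be freed\n", css->name);
}

int main(void)
{
	struct css parent = { "parent", 1 };   /* base reference held by the hierarchy */
	struct css child  = { "child",  1 };
	unsigned long nr_pages = 512;          /* e.g. one 2MB THP worth of 4K pages */

	/* the child pins itself once per charged page */
	css_get_many(&child, nr_pages);

	/* reparent: parent takes one ref per page, child drops one per page */
	css_get_many(&parent, nr_pages);
	css_put_many(&child, nr_pages);

	printf("parent refs=%ld, child refs=%ld\n",
	       atomic_load(&parent.refcnt), atomic_load(&child.refcnt));
	return 0;
}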
3411 3423
3412 #ifdef CONFIG_MEMCG_SWAP 3424 #ifdef CONFIG_MEMCG_SWAP
3413 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 3425 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
3414 bool charge) 3426 bool charge)
3415 { 3427 {
3416 int val = (charge) ? 1 : -1; 3428 int val = (charge) ? 1 : -1;
3417 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 3429 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
3418 } 3430 }
3419 3431
3420 /** 3432 /**
3421 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3433 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3422 * @entry: swap entry to be moved 3434 * @entry: swap entry to be moved
3423 * @from: mem_cgroup which the entry is moved from 3435 * @from: mem_cgroup which the entry is moved from
3424 * @to: mem_cgroup which the entry is moved to 3436 * @to: mem_cgroup which the entry is moved to
3425 * 3437 *
3426 * It succeeds only when the swap_cgroup's record for this entry is the same 3438 * It succeeds only when the swap_cgroup's record for this entry is the same
3427 * as the mem_cgroup's id of @from. 3439 * as the mem_cgroup's id of @from.
3428 * 3440 *
3429 * Returns 0 on success, -EINVAL on failure. 3441 * Returns 0 on success, -EINVAL on failure.
3430 * 3442 *
3431 * The caller must have charged to @to, IOW, called page_counter_charge() about 3443 * The caller must have charged to @to, IOW, called page_counter_charge() about
3432 * both res and memsw, and called css_get(). 3444 * both res and memsw, and called css_get().
3433 */ 3445 */
3434 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3446 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3435 struct mem_cgroup *from, struct mem_cgroup *to) 3447 struct mem_cgroup *from, struct mem_cgroup *to)
3436 { 3448 {
3437 unsigned short old_id, new_id; 3449 unsigned short old_id, new_id;
3438 3450
3439 old_id = mem_cgroup_id(from); 3451 old_id = mem_cgroup_id(from);
3440 new_id = mem_cgroup_id(to); 3452 new_id = mem_cgroup_id(to);
3441 3453
3442 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3454 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3443 mem_cgroup_swap_statistics(from, false); 3455 mem_cgroup_swap_statistics(from, false);
3444 mem_cgroup_swap_statistics(to, true); 3456 mem_cgroup_swap_statistics(to, true);
3445 /* 3457 /*
3446 * This function is only called from task migration context now. 3458 * This function is only called from task migration context now.
3447 * It postpones page_counter and refcount handling till the end 3459 * It postpones page_counter and refcount handling till the end
3448 * of task migration(mem_cgroup_clear_mc()) for performance 3460 * of task migration(mem_cgroup_clear_mc()) for performance
3449 * improvement. But we cannot postpone css_get(to) because if 3461 * improvement. But we cannot postpone css_get(to) because if
3450 * the process that has been moved to @to does swap-in, the 3462 * the process that has been moved to @to does swap-in, the
3451 * refcount of @to might be decreased to 0. 3463 * refcount of @to might be decreased to 0.
3452 * 3464 *
3453 * We are in attach() phase, so the cgroup is guaranteed to be 3465 * We are in attach() phase, so the cgroup is guaranteed to be
3454 * alive, so we can just call css_get(). 3466 * alive, so we can just call css_get().
3455 */ 3467 */
3456 css_get(&to->css); 3468 css_get(&to->css);
3457 return 0; 3469 return 0;
3458 } 3470 }
3459 return -EINVAL; 3471 return -EINVAL;
3460 } 3472 }
3461 #else 3473 #else
3462 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3474 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3463 struct mem_cgroup *from, struct mem_cgroup *to) 3475 struct mem_cgroup *from, struct mem_cgroup *to)
3464 { 3476 {
3465 return -EINVAL; 3477 return -EINVAL;
3466 } 3478 }
3467 #endif 3479 #endif
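mem_cgroup_move_swap_account() hinges on a compare-and-exchange of the swap_cgroup record: the owner id is rewritten only if it still matches @from, which is what makes the move safe against a concurrent uncharge. A stand-alone sketch of that check-and-swap idiom follows; the record array, slot index, and ids are invented for illustration and do not reflect how the kernel packs swap_cgroup records.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* One ownership record per "swap slot". */
static _Atomic unsigned short swap_record[16];

/* Reassign slot 'idx' from old_id to new_id only if old_id still owns it. */
static bool move_swap_record(unsigned idx, unsigned short old_id,
			     unsigned short new_id)
{
	unsigned short expected = old_id;

	return atomic_compare_exchange_strong(&swap_record[idx], &expected, new_id);
}

int main(void)
{
	atomic_store(&swap_record[3], 7);            /* slot 3 charged to group id 7 */

	printf("move 7->9: %s\n", move_swap_record(3, 7, 9) ? "ok" : "raced");
	printf("move 7->9 again: %s\n", move_swap_record(3, 7, 9) ? "ok" : "raced");
	printf("slot 3 now owned by id %u\n", (unsigned)atomic_load(&swap_record[3]));
	return 0;
}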
3468 3480
3469 #ifdef CONFIG_DEBUG_VM 3481 #ifdef CONFIG_DEBUG_VM
3470 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3482 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3471 { 3483 {
3472 struct page_cgroup *pc; 3484 struct page_cgroup *pc;
3473 3485
3474 pc = lookup_page_cgroup(page); 3486 pc = lookup_page_cgroup(page);
3475 /* 3487 /*
3476 * Can be NULL while feeding pages into the page allocator for 3488 * Can be NULL while feeding pages into the page allocator for
3477 * the first time, i.e. during boot or memory hotplug; 3489 * the first time, i.e. during boot or memory hotplug;
3478 * or when mem_cgroup_disabled(). 3490 * or when mem_cgroup_disabled().
3479 */ 3491 */
3480 if (likely(pc) && PageCgroupUsed(pc)) 3492 if (likely(pc) && PageCgroupUsed(pc))
3481 return pc; 3493 return pc;
3482 return NULL; 3494 return NULL;
3483 } 3495 }
3484 3496
3485 bool mem_cgroup_bad_page_check(struct page *page) 3497 bool mem_cgroup_bad_page_check(struct page *page)
3486 { 3498 {
3487 if (mem_cgroup_disabled()) 3499 if (mem_cgroup_disabled())
3488 return false; 3500 return false;
3489 3501
3490 return lookup_page_cgroup_used(page) != NULL; 3502 return lookup_page_cgroup_used(page) != NULL;
3491 } 3503 }
3492 3504
3493 void mem_cgroup_print_bad_page(struct page *page) 3505 void mem_cgroup_print_bad_page(struct page *page)
3494 { 3506 {
3495 struct page_cgroup *pc; 3507 struct page_cgroup *pc;
3496 3508
3497 pc = lookup_page_cgroup_used(page); 3509 pc = lookup_page_cgroup_used(page);
3498 if (pc) { 3510 if (pc) {
3499 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 3511 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
3500 pc, pc->flags, pc->mem_cgroup); 3512 pc, pc->flags, pc->mem_cgroup);
3501 } 3513 }
3502 } 3514 }
3503 #endif 3515 #endif
3504 3516
3505 static DEFINE_MUTEX(memcg_limit_mutex); 3517 static DEFINE_MUTEX(memcg_limit_mutex);
3506 3518
3507 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3519 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3508 unsigned long limit) 3520 unsigned long limit)
3509 { 3521 {
3510 unsigned long curusage; 3522 unsigned long curusage;
3511 unsigned long oldusage; 3523 unsigned long oldusage;
3512 bool enlarge = false; 3524 bool enlarge = false;
3513 int retry_count; 3525 int retry_count;
3514 int ret; 3526 int ret;
3515 3527
3516 /* 3528 /*
3517 * For keeping hierarchical_reclaim simple, how long we should retry 3529 * For keeping hierarchical_reclaim simple, how long we should retry
3518 * depends on the callers. We set our retry-count to be a function 3530 * depends on the callers. We set our retry-count to be a function
3519 * of the # of children which we should visit in this loop. 3531 * of the # of children which we should visit in this loop.
3520 */ 3532 */
3521 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3533 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3522 mem_cgroup_count_children(memcg); 3534 mem_cgroup_count_children(memcg);
3523 3535
3524 oldusage = page_counter_read(&memcg->memory); 3536 oldusage = page_counter_read(&memcg->memory);
3525 3537
3526 do { 3538 do {
3527 if (signal_pending(current)) { 3539 if (signal_pending(current)) {
3528 ret = -EINTR; 3540 ret = -EINTR;
3529 break; 3541 break;
3530 } 3542 }
3531 3543
3532 mutex_lock(&memcg_limit_mutex); 3544 mutex_lock(&memcg_limit_mutex);
3533 if (limit > memcg->memsw.limit) { 3545 if (limit > memcg->memsw.limit) {
3534 mutex_unlock(&memcg_limit_mutex); 3546 mutex_unlock(&memcg_limit_mutex);
3535 ret = -EINVAL; 3547 ret = -EINVAL;
3536 break; 3548 break;
3537 } 3549 }
3538 if (limit > memcg->memory.limit) 3550 if (limit > memcg->memory.limit)
3539 enlarge = true; 3551 enlarge = true;
3540 ret = page_counter_limit(&memcg->memory, limit); 3552 ret = page_counter_limit(&memcg->memory, limit);
3541 mutex_unlock(&memcg_limit_mutex); 3553 mutex_unlock(&memcg_limit_mutex);
3542 3554
3543 if (!ret) 3555 if (!ret)
3544 break; 3556 break;
3545 3557
3546 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); 3558 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
3547 3559
3548 curusage = page_counter_read(&memcg->memory); 3560 curusage = page_counter_read(&memcg->memory);
3549 /* Usage is reduced ? */ 3561 /* Usage is reduced ? */
3550 if (curusage >= oldusage) 3562 if (curusage >= oldusage)
3551 retry_count--; 3563 retry_count--;
3552 else 3564 else
3553 oldusage = curusage; 3565 oldusage = curusage;
3554 } while (retry_count); 3566 } while (retry_count);
3555 3567
3556 if (!ret && enlarge) 3568 if (!ret && enlarge)
3557 memcg_oom_recover(memcg); 3569 memcg_oom_recover(memcg);
3558 3570
3559 return ret; 3571 return ret;
3560 } 3572 }
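Both resize paths follow the same shape: try to lower the counter's limit, and if that fails because usage is above the new limit, reclaim and retry a bounded number of times, only burning a retry when reclaim made no forward progress. A userspace sketch of that control flow is below; the counter struct and the reclaim stub are assumptions for the example, not the kernel's page_counter.

#include <errno.h>
#include <stdio.h>

struct counter {
	unsigned long usage;
	unsigned long limit;
};

/* Pretend reclaim: frees up to 8 units per call, possibly nothing. */
static unsigned long reclaim_some(struct counter *c)
{
	unsigned long freed = c->usage > 8 ? 8 : c->usage;

	c->usage -= freed;
	return freed;
}

static int set_limit(struct counter *c, unsigned long new_limit)
{
	if (c->usage > new_limit)
		return -EBUSY;      /* can't shrink below current usage */
	c->limit = new_limit;
	return 0;
}

static int resize_limit(struct counter *c, unsigned long new_limit, int retries)
{
	unsigned long oldusage = c->usage;

	while (retries) {
		if (!set_limit(c, new_limit))
			return 0;

		reclaim_some(c);
		/* only burn a retry if usage did not shrink */
		if (c->usage >= oldusage)
			retries--;
		else
			oldusage = c->usage;
	}
	return -EBUSY;
}

int main(void)
{
	struct counter c = { .usage = 40, .limit = 100 };

	printf("resize to 32: %d (usage=%lu, limit=%lu)\n",
	       resize_limit(&c, 32, 5), c.usage, c.limit);
	return 0;
}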
3561 3573
3562 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3574 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3563 unsigned long limit) 3575 unsigned long limit)
3564 { 3576 {
3565 unsigned long curusage; 3577 unsigned long curusage;
3566 unsigned long oldusage; 3578 unsigned long oldusage;
3567 bool enlarge = false; 3579 bool enlarge = false;
3568 int retry_count; 3580 int retry_count;
3569 int ret; 3581 int ret;
3570 3582
3571 /* see mem_cgroup_resize_limit */ 3583 /* see mem_cgroup_resize_limit */
3572 retry_count = MEM_CGROUP_RECLAIM_RETRIES * 3584 retry_count = MEM_CGROUP_RECLAIM_RETRIES *
3573 mem_cgroup_count_children(memcg); 3585 mem_cgroup_count_children(memcg);
3574 3586
3575 oldusage = page_counter_read(&memcg->memsw); 3587 oldusage = page_counter_read(&memcg->memsw);
3576 3588
3577 do { 3589 do {
3578 if (signal_pending(current)) { 3590 if (signal_pending(current)) {
3579 ret = -EINTR; 3591 ret = -EINTR;
3580 break; 3592 break;
3581 } 3593 }
3582 3594
3583 mutex_lock(&memcg_limit_mutex); 3595 mutex_lock(&memcg_limit_mutex);
3584 if (limit < memcg->memory.limit) { 3596 if (limit < memcg->memory.limit) {
3585 mutex_unlock(&memcg_limit_mutex); 3597 mutex_unlock(&memcg_limit_mutex);
3586 ret = -EINVAL; 3598 ret = -EINVAL;
3587 break; 3599 break;
3588 } 3600 }
3589 if (limit > memcg->memsw.limit) 3601 if (limit > memcg->memsw.limit)
3590 enlarge = true; 3602 enlarge = true;
3591 ret = page_counter_limit(&memcg->memsw, limit); 3603 ret = page_counter_limit(&memcg->memsw, limit);
3592 mutex_unlock(&memcg_limit_mutex); 3604 mutex_unlock(&memcg_limit_mutex);
3593 3605
3594 if (!ret) 3606 if (!ret)
3595 break; 3607 break;
3596 3608
3597 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); 3609 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
3598 3610
3599 curusage = page_counter_read(&memcg->memsw); 3611 curusage = page_counter_read(&memcg->memsw);
3600 /* Usage is reduced ? */ 3612 /* Usage is reduced ? */
3601 if (curusage >= oldusage) 3613 if (curusage >= oldusage)
3602 retry_count--; 3614 retry_count--;
3603 else 3615 else
3604 oldusage = curusage; 3616 oldusage = curusage;
3605 } while (retry_count); 3617 } while (retry_count);
3606 3618
3607 if (!ret && enlarge) 3619 if (!ret && enlarge)
3608 memcg_oom_recover(memcg); 3620 memcg_oom_recover(memcg);
3609 3621
3610 return ret; 3622 return ret;
3611 } 3623 }
3612 3624
3613 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3625 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3614 gfp_t gfp_mask, 3626 gfp_t gfp_mask,
3615 unsigned long *total_scanned) 3627 unsigned long *total_scanned)
3616 { 3628 {
3617 unsigned long nr_reclaimed = 0; 3629 unsigned long nr_reclaimed = 0;
3618 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3630 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3619 unsigned long reclaimed; 3631 unsigned long reclaimed;
3620 int loop = 0; 3632 int loop = 0;
3621 struct mem_cgroup_tree_per_zone *mctz; 3633 struct mem_cgroup_tree_per_zone *mctz;
3622 unsigned long excess; 3634 unsigned long excess;
3623 unsigned long nr_scanned; 3635 unsigned long nr_scanned;
3624 3636
3625 if (order > 0) 3637 if (order > 0)
3626 return 0; 3638 return 0;
3627 3639
3628 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3640 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3629 /* 3641 /*
3630 * This loop can run for a while, especially if mem_cgroups continuously 3642 * This loop can run for a while, especially if mem_cgroups continuously
3631 * keep exceeding their soft limit and putting the system under 3643 * keep exceeding their soft limit and putting the system under
3632 * pressure 3644 * pressure
3633 */ 3645 */
3634 do { 3646 do {
3635 if (next_mz) 3647 if (next_mz)
3636 mz = next_mz; 3648 mz = next_mz;
3637 else 3649 else
3638 mz = mem_cgroup_largest_soft_limit_node(mctz); 3650 mz = mem_cgroup_largest_soft_limit_node(mctz);
3639 if (!mz) 3651 if (!mz)
3640 break; 3652 break;
3641 3653
3642 nr_scanned = 0; 3654 nr_scanned = 0;
3643 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 3655 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
3644 gfp_mask, &nr_scanned); 3656 gfp_mask, &nr_scanned);
3645 nr_reclaimed += reclaimed; 3657 nr_reclaimed += reclaimed;
3646 *total_scanned += nr_scanned; 3658 *total_scanned += nr_scanned;
3647 spin_lock_irq(&mctz->lock); 3659 spin_lock_irq(&mctz->lock);
3648 3660
3649 /* 3661 /*
3650 * If we failed to reclaim anything from this memory cgroup 3662 * If we failed to reclaim anything from this memory cgroup
3651 * it is time to move on to the next cgroup 3663 * it is time to move on to the next cgroup
3652 */ 3664 */
3653 next_mz = NULL; 3665 next_mz = NULL;
3654 if (!reclaimed) { 3666 if (!reclaimed) {
3655 do { 3667 do {
3656 /* 3668 /*
3657 * Loop until we find yet another one. 3669 * Loop until we find yet another one.
3658 * 3670 *
3659 * By the time we get the soft_limit lock 3671 * By the time we get the soft_limit lock
3660 * again, someone might have added the 3672 * again, someone might have added the
3661 * group back on the RB tree. Iterate to 3673 * group back on the RB tree. Iterate to
3662 * make sure we get a different mem. 3674 * make sure we get a different mem.
3663 * mem_cgroup_largest_soft_limit_node returns 3675 * mem_cgroup_largest_soft_limit_node returns
3664 * NULL if no other cgroup is present on 3676 * NULL if no other cgroup is present on
3665 * the tree 3677 * the tree
3666 */ 3678 */
3667 next_mz = 3679 next_mz =
3668 __mem_cgroup_largest_soft_limit_node(mctz); 3680 __mem_cgroup_largest_soft_limit_node(mctz);
3669 if (next_mz == mz) 3681 if (next_mz == mz)
3670 css_put(&next_mz->memcg->css); 3682 css_put(&next_mz->memcg->css);
3671 else /* next_mz == NULL or other memcg */ 3683 else /* next_mz == NULL or other memcg */
3672 break; 3684 break;
3673 } while (1); 3685 } while (1);
3674 } 3686 }
3675 __mem_cgroup_remove_exceeded(mz, mctz); 3687 __mem_cgroup_remove_exceeded(mz, mctz);
3676 excess = soft_limit_excess(mz->memcg); 3688 excess = soft_limit_excess(mz->memcg);
3677 /* 3689 /*
3678 * One school of thought says that we should not add 3690 * One school of thought says that we should not add
3679 * back the node to the tree if reclaim returns 0. 3691 * back the node to the tree if reclaim returns 0.
3680 * But our reclaim could return 0, simply because due 3692 * But our reclaim could return 0, simply because due
3681 * to priority we are exposing a smaller subset of 3693 * to priority we are exposing a smaller subset of
3682 * memory to reclaim from. Consider this as a longer 3694 * memory to reclaim from. Consider this as a longer
3683 * term TODO. 3695 * term TODO.
3684 */ 3696 */
3685 /* If excess == 0, no tree ops */ 3697 /* If excess == 0, no tree ops */
3686 __mem_cgroup_insert_exceeded(mz, mctz, excess); 3698 __mem_cgroup_insert_exceeded(mz, mctz, excess);
3687 spin_unlock_irq(&mctz->lock); 3699 spin_unlock_irq(&mctz->lock);
3688 css_put(&mz->memcg->css); 3700 css_put(&mz->memcg->css);
3689 loop++; 3701 loop++;
3690 /* 3702 /*
3691 * Could not reclaim anything and there are no more 3703 * Could not reclaim anything and there are no more
3692 * mem cgroups to try or we seem to be looping without 3704 * mem cgroups to try or we seem to be looping without
3693 * reclaiming anything. 3705 * reclaiming anything.
3694 */ 3706 */
3695 if (!nr_reclaimed && 3707 if (!nr_reclaimed &&
3696 (next_mz == NULL || 3708 (next_mz == NULL ||
3697 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3709 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3698 break; 3710 break;
3699 } while (!nr_reclaimed); 3711 } while (!nr_reclaimed);
3700 if (next_mz) 3712 if (next_mz)
3701 css_put(&next_mz->memcg->css); 3713 css_put(&next_mz->memcg->css);
3702 return nr_reclaimed; 3714 return nr_reclaimed;
3703 } 3715 }
3704 3716
3705 /** 3717 /**
3706 * mem_cgroup_force_empty_list - clears LRU of a group 3718 * mem_cgroup_force_empty_list - clears LRU of a group
3707 * @memcg: group to clear 3719 * @memcg: group to clear
3708 * @node: NUMA node 3720 * @node: NUMA node
3709 * @zid: zone id 3721 * @zid: zone id
3710 * @lru: lru to clear 3722 * @lru: lru to clear
3711 * 3723 *
3712 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 3724 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3713 * reclaim the pages themselves - pages are moved to the parent (or root) 3725 * reclaim the pages themselves - pages are moved to the parent (or root)
3714 * group. 3726 * group.
3715 */ 3727 */
3716 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3728 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3717 int node, int zid, enum lru_list lru) 3729 int node, int zid, enum lru_list lru)
3718 { 3730 {
3719 struct lruvec *lruvec; 3731 struct lruvec *lruvec;
3720 unsigned long flags; 3732 unsigned long flags;
3721 struct list_head *list; 3733 struct list_head *list;
3722 struct page *busy; 3734 struct page *busy;
3723 struct zone *zone; 3735 struct zone *zone;
3724 3736
3725 zone = &NODE_DATA(node)->node_zones[zid]; 3737 zone = &NODE_DATA(node)->node_zones[zid];
3726 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 3738 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
3727 list = &lruvec->lists[lru]; 3739 list = &lruvec->lists[lru];
3728 3740
3729 busy = NULL; 3741 busy = NULL;
3730 do { 3742 do {
3731 struct page_cgroup *pc; 3743 struct page_cgroup *pc;
3732 struct page *page; 3744 struct page *page;
3733 3745
3734 spin_lock_irqsave(&zone->lru_lock, flags); 3746 spin_lock_irqsave(&zone->lru_lock, flags);
3735 if (list_empty(list)) { 3747 if (list_empty(list)) {
3736 spin_unlock_irqrestore(&zone->lru_lock, flags); 3748 spin_unlock_irqrestore(&zone->lru_lock, flags);
3737 break; 3749 break;
3738 } 3750 }
3739 page = list_entry(list->prev, struct page, lru); 3751 page = list_entry(list->prev, struct page, lru);
3740 if (busy == page) { 3752 if (busy == page) {
3741 list_move(&page->lru, list); 3753 list_move(&page->lru, list);
3742 busy = NULL; 3754 busy = NULL;
3743 spin_unlock_irqrestore(&zone->lru_lock, flags); 3755 spin_unlock_irqrestore(&zone->lru_lock, flags);
3744 continue; 3756 continue;
3745 } 3757 }
3746 spin_unlock_irqrestore(&zone->lru_lock, flags); 3758 spin_unlock_irqrestore(&zone->lru_lock, flags);
3747 3759
3748 pc = lookup_page_cgroup(page); 3760 pc = lookup_page_cgroup(page);
3749 3761
3750 if (mem_cgroup_move_parent(page, pc, memcg)) { 3762 if (mem_cgroup_move_parent(page, pc, memcg)) {
3751 /* found lock contention or "pc" is obsolete. */ 3763 /* found lock contention or "pc" is obsolete. */
3752 busy = page; 3764 busy = page;
3753 } else 3765 } else
3754 busy = NULL; 3766 busy = NULL;
3755 cond_resched(); 3767 cond_resched();
3756 } while (!list_empty(list)); 3768 } while (!list_empty(list));
3757 } 3769 }
3758 3770
3759 /* 3771 /*
3760 * make the mem_cgroup's charge 0 if there is no task, by moving 3772 * make the mem_cgroup's charge 0 if there is no task, by moving
3761 * all the charges and pages to the parent. 3773 * all the charges and pages to the parent.
3762 * This enables deleting this mem_cgroup. 3774 * This enables deleting this mem_cgroup.
3763 * 3775 *
3764 * Caller is responsible for holding css reference on the memcg. 3776 * Caller is responsible for holding css reference on the memcg.
3765 */ 3777 */
3766 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 3778 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
3767 { 3779 {
3768 int node, zid; 3780 int node, zid;
3769 3781
3770 do { 3782 do {
3771 /* This is for making all *used* pages to be on LRU. */ 3783 /* This is for making all *used* pages to be on LRU. */
3772 lru_add_drain_all(); 3784 lru_add_drain_all();
3773 drain_all_stock_sync(memcg); 3785 drain_all_stock_sync(memcg);
3774 mem_cgroup_start_move(memcg); 3786 mem_cgroup_start_move(memcg);
3775 for_each_node_state(node, N_MEMORY) { 3787 for_each_node_state(node, N_MEMORY) {
3776 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3788 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3777 enum lru_list lru; 3789 enum lru_list lru;
3778 for_each_lru(lru) { 3790 for_each_lru(lru) {
3779 mem_cgroup_force_empty_list(memcg, 3791 mem_cgroup_force_empty_list(memcg,
3780 node, zid, lru); 3792 node, zid, lru);
3781 } 3793 }
3782 } 3794 }
3783 } 3795 }
3784 mem_cgroup_end_move(memcg); 3796 mem_cgroup_end_move(memcg);
3785 memcg_oom_recover(memcg); 3797 memcg_oom_recover(memcg);
3786 cond_resched(); 3798 cond_resched();
3787 3799
3788 /* 3800 /*
3789 * Kernel memory may not necessarily be trackable to a specific 3801 * Kernel memory may not necessarily be trackable to a specific
3790 * process, so such pages are not migrated, and therefore we can't 3802 * process, so such pages are not migrated, and therefore we can't
3791 * expect their value to drop to 0 here. 3803 * expect their value to drop to 0 here.
3792 * Having res filled up with kmem only is enough. 3804 * Having res filled up with kmem only is enough.
3793 * 3805 *
3794 * This is a safety check because mem_cgroup_force_empty_list 3806 * This is a safety check because mem_cgroup_force_empty_list
3795 * could have raced with mem_cgroup_replace_page_cache callers 3807 * could have raced with mem_cgroup_replace_page_cache callers
3796 * so the lru seemed empty but the page could have been added 3808 * so the lru seemed empty but the page could have been added
3797 * right after the check. RES_USAGE should be safe as we always 3809 * right after the check. RES_USAGE should be safe as we always
3798 * charge before adding to the LRU. 3810 * charge before adding to the LRU.
3799 */ 3811 */
3800 } while (page_counter_read(&memcg->memory) - 3812 } while (page_counter_read(&memcg->memory) -
3801 page_counter_read(&memcg->kmem) > 0); 3813 page_counter_read(&memcg->kmem) > 0);
3802 } 3814 }
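mem_cgroup_reparent_charges() sweeps every node/zone/LRU list and then re-checks the residual charge, repeating the whole sweep because pages can be charged (or sit on a pagevec) while the sweep runs; only kernel-memory charges are allowed to remain. The loop skeleton might look like the sketch below, with the memcg machinery replaced by an invented per-list page count.

#include <stdio.h>

#define NR_NODES 2
#define NR_LRUS  4

static int lru_pages[NR_NODES][NR_LRUS] = {
	{ 3, 0, 5, 1 },
	{ 2, 2, 0, 0 },
};
static int user_charge = 13;   /* total pages above, i.e. memory minus kmem */

/* Move every page currently on one list to the parent group. */
static void force_empty_list(int node, int lru)
{
	user_charge -= lru_pages[node][lru];
	lru_pages[node][lru] = 0;
}

int main(void)
{
	int passes = 0;

	do {
		/* lru_add_drain_all() / drain_all_stock_sync() would go here */
		for (int node = 0; node < NR_NODES; node++)
			for (int lru = 0; lru < NR_LRUS; lru++)
				force_empty_list(node, lru);
		passes++;
		/* re-check: pages charged during the sweep need another pass */
	} while (user_charge > 0);

	printf("emptied after %d pass(es), residual user charge %d\n",
	       passes, user_charge);
	return 0;
}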
3803 3815
3804 /* 3816 /*
3805 * Test whether @memcg has children, dead or alive. Note that this 3817 * Test whether @memcg has children, dead or alive. Note that this
3806 * function doesn't care whether @memcg has use_hierarchy enabled and 3818 * function doesn't care whether @memcg has use_hierarchy enabled and
3807 * returns %true if there are child csses according to the cgroup 3819 * returns %true if there are child csses according to the cgroup
3808 * hierarchy. Testing use_hierarchy is the caller's responsibility. 3820 * hierarchy. Testing use_hierarchy is the caller's responsibility.
3809 */ 3821 */
3810 static inline bool memcg_has_children(struct mem_cgroup *memcg) 3822 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3811 { 3823 {
3812 bool ret; 3824 bool ret;
3813 3825
3814 /* 3826 /*
3815 * The lock does not prevent addition or deletion of children, but 3827 * The lock does not prevent addition or deletion of children, but
3816 * it prevents a new child from being initialized based on this 3828 * it prevents a new child from being initialized based on this
3817 * parent in css_online(), so it's enough to decide whether 3829 * parent in css_online(), so it's enough to decide whether
3818 * hierarchically inherited attributes can still be changed or not. 3830 * hierarchically inherited attributes can still be changed or not.
3819 */ 3831 */
3820 lockdep_assert_held(&memcg_create_mutex); 3832 lockdep_assert_held(&memcg_create_mutex);
3821 3833
3822 rcu_read_lock(); 3834 rcu_read_lock();
3823 ret = css_next_child(NULL, &memcg->css); 3835 ret = css_next_child(NULL, &memcg->css);
3824 rcu_read_unlock(); 3836 rcu_read_unlock();
3825 return ret; 3837 return ret;
3826 } 3838 }
3827 3839
3828 /* 3840 /*
3829 * Reclaims as many pages from the given memcg as possible and moves 3841 * Reclaims as many pages from the given memcg as possible and moves
3830 * the rest to the parent. 3842 * the rest to the parent.
3831 * 3843 *
3832 * Caller is responsible for holding css reference for memcg. 3844 * Caller is responsible for holding css reference for memcg.
3833 */ 3845 */
3834 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 3846 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3835 { 3847 {
3836 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3848 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3837 3849
3838 /* we call try-to-free pages to make this cgroup empty */ 3850 /* we call try-to-free pages to make this cgroup empty */
3839 lru_add_drain_all(); 3851 lru_add_drain_all();
3840 /* try to free all pages in this cgroup */ 3852 /* try to free all pages in this cgroup */
3841 while (nr_retries && page_counter_read(&memcg->memory)) { 3853 while (nr_retries && page_counter_read(&memcg->memory)) {
3842 int progress; 3854 int progress;
3843 3855
3844 if (signal_pending(current)) 3856 if (signal_pending(current))
3845 return -EINTR; 3857 return -EINTR;
3846 3858
3847 progress = try_to_free_mem_cgroup_pages(memcg, 1, 3859 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3848 GFP_KERNEL, true); 3860 GFP_KERNEL, true);
3849 if (!progress) { 3861 if (!progress) {
3850 nr_retries--; 3862 nr_retries--;
3851 /* maybe some writeback is necessary */ 3863 /* maybe some writeback is necessary */
3852 congestion_wait(BLK_RW_ASYNC, HZ/10); 3864 congestion_wait(BLK_RW_ASYNC, HZ/10);
3853 } 3865 }
3854 3866
3855 } 3867 }
3856 3868
3857 return 0; 3869 return 0;
3858 } 3870 }
3859 3871
3860 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, 3872 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3861 char *buf, size_t nbytes, 3873 char *buf, size_t nbytes,
3862 loff_t off) 3874 loff_t off)
3863 { 3875 {
3864 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3876 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3865 3877
3866 if (mem_cgroup_is_root(memcg)) 3878 if (mem_cgroup_is_root(memcg))
3867 return -EINVAL; 3879 return -EINVAL;
3868 return mem_cgroup_force_empty(memcg) ?: nbytes; 3880 return mem_cgroup_force_empty(memcg) ?: nbytes;
3869 } 3881 }
3870 3882
3871 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 3883 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3872 struct cftype *cft) 3884 struct cftype *cft)
3873 { 3885 {
3874 return mem_cgroup_from_css(css)->use_hierarchy; 3886 return mem_cgroup_from_css(css)->use_hierarchy;
3875 } 3887 }
3876 3888
3877 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 3889 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3878 struct cftype *cft, u64 val) 3890 struct cftype *cft, u64 val)
3879 { 3891 {
3880 int retval = 0; 3892 int retval = 0;
3881 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3893 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3882 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); 3894 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3883 3895
3884 mutex_lock(&memcg_create_mutex); 3896 mutex_lock(&memcg_create_mutex);
3885 3897
3886 if (memcg->use_hierarchy == val) 3898 if (memcg->use_hierarchy == val)
3887 goto out; 3899 goto out;
3888 3900
3889 /* 3901 /*
3890 * If parent's use_hierarchy is set, we can't make any modifications 3902 * If parent's use_hierarchy is set, we can't make any modifications
3891 * in the child subtrees. If it is unset, then the change can 3903 * in the child subtrees. If it is unset, then the change can
3892 * occur, provided the current cgroup has no children. 3904 * occur, provided the current cgroup has no children.
3893 * 3905 *
3894 * For the root cgroup, parent_mem is NULL, we allow value to be 3906 * For the root cgroup, parent_mem is NULL, we allow value to be
3895 * set if there are no children. 3907 * set if there are no children.
3896 */ 3908 */
3897 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 3909 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3898 (val == 1 || val == 0)) { 3910 (val == 1 || val == 0)) {
3899 if (!memcg_has_children(memcg)) 3911 if (!memcg_has_children(memcg))
3900 memcg->use_hierarchy = val; 3912 memcg->use_hierarchy = val;
3901 else 3913 else
3902 retval = -EBUSY; 3914 retval = -EBUSY;
3903 } else 3915 } else
3904 retval = -EINVAL; 3916 retval = -EINVAL;
3905 3917
3906 out: 3918 out:
3907 mutex_unlock(&memcg_create_mutex); 3919 mutex_unlock(&memcg_create_mutex);
3908 3920
3909 return retval; 3921 return retval;
3910 } 3922 }
3911 3923
3912 static unsigned long tree_stat(struct mem_cgroup *memcg, 3924 static unsigned long tree_stat(struct mem_cgroup *memcg,
3913 enum mem_cgroup_stat_index idx) 3925 enum mem_cgroup_stat_index idx)
3914 { 3926 {
3915 struct mem_cgroup *iter; 3927 struct mem_cgroup *iter;
3916 long val = 0; 3928 long val = 0;
3917 3929
3918 /* Per-cpu values can be negative, use a signed accumulator */ 3930 /* Per-cpu values can be negative, use a signed accumulator */
3919 for_each_mem_cgroup_tree(iter, memcg) 3931 for_each_mem_cgroup_tree(iter, memcg)
3920 val += mem_cgroup_read_stat(iter, idx); 3932 val += mem_cgroup_read_stat(iter, idx);
3921 3933
3922 if (val < 0) /* race ? */ 3934 if (val < 0) /* race ? */
3923 val = 0; 3935 val = 0;
3924 return val; 3936 return val;
3925 } 3937 }
3926 3938
3927 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 3939 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3928 { 3940 {
3929 u64 val; 3941 u64 val;
3930 3942
3931 if (mem_cgroup_is_root(memcg)) { 3943 if (mem_cgroup_is_root(memcg)) {
3932 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); 3944 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3933 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); 3945 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3934 if (swap) 3946 if (swap)
3935 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); 3947 val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3936 } else { 3948 } else {
3937 if (!swap) 3949 if (!swap)
3938 val = page_counter_read(&memcg->memory); 3950 val = page_counter_read(&memcg->memory);
3939 else 3951 else
3940 val = page_counter_read(&memcg->memsw); 3952 val = page_counter_read(&memcg->memsw);
3941 } 3953 }
3942 return val << PAGE_SHIFT; 3954 return val << PAGE_SHIFT;
3943 } 3955 }
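For the root group, usage is derived by summing the cache/rss (and optionally swap) statistics over the whole subtree, while any other group simply reads its page counter; the page count is then shifted into bytes. A small stand-alone sketch of that split follows; the two-level tree, the stat fields, and the subtree walk are made up for illustration and do not mirror for_each_mem_cgroup_tree().

#include <stdio.h>

#define PAGE_SHIFT 12

struct group {
	unsigned long cache, rss;         /* per-group statistics, in pages */
	unsigned long counter;            /* charged pages (page_counter analogue) */
	struct group *parent;
};

/* Sum one statistic over 'root' and all of its descendants. */
static unsigned long tree_stat(struct group *groups, int n, struct group *root,
			       unsigned long (*pick)(struct group *))
{
	unsigned long val = 0;

	for (int i = 0; i < n; i++)
		for (struct group *p = &groups[i]; p; p = p->parent)
			if (p == root) {          /* groups[i] is inside root's subtree */
				val += pick(&groups[i]);
				break;
			}
	return val;
}

static unsigned long pick_cache(struct group *g) { return g->cache; }
static unsigned long pick_rss(struct group *g)   { return g->rss; }

static unsigned long long usage(struct group *groups, int n, struct group *g)
{
	unsigned long pages;

	if (!g->parent)    /* root: no counter is maintained, sum the stats */
		pages = tree_stat(groups, n, g, pick_cache) +
			tree_stat(groups, n, g, pick_rss);
	else
		pages = g->counter;

	return (unsigned long long)pages << PAGE_SHIFT;
}

int main(void)
{
	struct group tree[3] = {
		{ .cache = 10, .rss = 5 },                        /* root */
		{ .cache = 2,  .rss = 8, .counter = 10 },
		{ .cache = 1,  .rss = 1, .counter = 2 },
	};
	tree[1].parent = &tree[0];
	tree[2].parent = &tree[1];

	printf("root usage:  %llu bytes\n", usage(tree, 3, &tree[0]));
	printf("child usage: %llu bytes\n", usage(tree, 3, &tree[1]));
	return 0;
}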
3944 3956
3945 enum { 3957 enum {
3946 RES_USAGE, 3958 RES_USAGE,
3947 RES_LIMIT, 3959 RES_LIMIT,
3948 RES_MAX_USAGE, 3960 RES_MAX_USAGE,
3949 RES_FAILCNT, 3961 RES_FAILCNT,
3950 RES_SOFT_LIMIT, 3962 RES_SOFT_LIMIT,
3951 }; 3963 };
3952 3964
3953 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 3965 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3954 struct cftype *cft) 3966 struct cftype *cft)
3955 { 3967 {
3956 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3968 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3957 struct page_counter *counter; 3969 struct page_counter *counter;
3958 3970
3959 switch (MEMFILE_TYPE(cft->private)) { 3971 switch (MEMFILE_TYPE(cft->private)) {
3960 case _MEM: 3972 case _MEM:
3961 counter = &memcg->memory; 3973 counter = &memcg->memory;
3962 break; 3974 break;
3963 case _MEMSWAP: 3975 case _MEMSWAP:
3964 counter = &memcg->memsw; 3976 counter = &memcg->memsw;
3965 break; 3977 break;
3966 case _KMEM: 3978 case _KMEM:
3967 counter = &memcg->kmem; 3979 counter = &memcg->kmem;
3968 break; 3980 break;
3969 default: 3981 default:
3970 BUG(); 3982 BUG();
3971 } 3983 }
3972 3984
3973 switch (MEMFILE_ATTR(cft->private)) { 3985 switch (MEMFILE_ATTR(cft->private)) {
3974 case RES_USAGE: 3986 case RES_USAGE:
3975 if (counter == &memcg->memory) 3987 if (counter == &memcg->memory)
3976 return mem_cgroup_usage(memcg, false); 3988 return mem_cgroup_usage(memcg, false);
3977 if (counter == &memcg->memsw) 3989 if (counter == &memcg->memsw)
3978 return mem_cgroup_usage(memcg, true); 3990 return mem_cgroup_usage(memcg, true);
3979 return (u64)page_counter_read(counter) * PAGE_SIZE; 3991 return (u64)page_counter_read(counter) * PAGE_SIZE;
3980 case RES_LIMIT: 3992 case RES_LIMIT:
3981 return (u64)counter->limit * PAGE_SIZE; 3993 return (u64)counter->limit * PAGE_SIZE;
3982 case RES_MAX_USAGE: 3994 case RES_MAX_USAGE:
3983 return (u64)counter->watermark * PAGE_SIZE; 3995 return (u64)counter->watermark * PAGE_SIZE;
3984 case RES_FAILCNT: 3996 case RES_FAILCNT:
3985 return counter->failcnt; 3997 return counter->failcnt;
3986 case RES_SOFT_LIMIT: 3998 case RES_SOFT_LIMIT:
3987 return (u64)memcg->soft_limit * PAGE_SIZE; 3999 return (u64)memcg->soft_limit * PAGE_SIZE;
3988 default: 4000 default:
3989 BUG(); 4001 BUG();
3990 } 4002 }
3991 } 4003 }
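mem_cgroup_read_u64() dispatches on two values packed into cft->private: which counter the file refers to (_MEM, _MEMSWAP, _KMEM) and which attribute of it (usage, limit, ...). A common way to pack such a pair into one integer is to keep the type in the high bits and the attribute in the low bits; the macros below mimic that idea and are illustrative only, since the MEMFILE_* definitions are not part of this hunk and the exact encoding may differ.

#include <stdio.h>

enum { _MEM, _MEMSWAP, _KMEM };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT, RES_SOFT_LIMIT };

#define MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
#define MEMFILE_TYPE(val)		(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)		((val) & 0xffff)

int main(void)
{
	int private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

	printf("packed: %#x -> type %d, attr %d\n",
	       private, MEMFILE_TYPE(private), MEMFILE_ATTR(private));
	return 0;
}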
3992 4004
3993 #ifdef CONFIG_MEMCG_KMEM 4005 #ifdef CONFIG_MEMCG_KMEM
3994 /* should be called with activate_kmem_mutex held */ 4006 /* should be called with activate_kmem_mutex held */
3995 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 4007 static int __memcg_activate_kmem(struct mem_cgroup *memcg,
3996 unsigned long nr_pages) 4008 unsigned long nr_pages)
3997 { 4009 {
3998 int err = 0; 4010 int err = 0;
3999 int memcg_id; 4011 int memcg_id;
4000 4012
4001 if (memcg_kmem_is_active(memcg)) 4013 if (memcg_kmem_is_active(memcg))
4002 return 0; 4014 return 0;
4003 4015
4004 /* 4016 /*
4005 * We are going to allocate memory for data shared by all memory 4017 * We are going to allocate memory for data shared by all memory
4006 * cgroups so let's stop accounting here. 4018 * cgroups so let's stop accounting here.
4007 */ 4019 */
4008 memcg_stop_kmem_account(); 4020 memcg_stop_kmem_account();
4009 4021
4010 /* 4022 /*
4011 * For simplicity, we won't allow this to be disabled. It also can't 4023 * For simplicity, we won't allow this to be disabled. It also can't
4012 * be changed if the cgroup has children already, or if tasks had 4024 * be changed if the cgroup has children already, or if tasks had
4013 * already joined. 4025 * already joined.
4014 * 4026 *
4015 * If tasks join before we set the limit, a person looking at 4027 * If tasks join before we set the limit, a person looking at
4016 * kmem.usage_in_bytes will have no way to determine when it took 4028 * kmem.usage_in_bytes will have no way to determine when it took
4017 * place, which makes the value quite meaningless. 4029 * place, which makes the value quite meaningless.
4018 * 4030 *
4019 * After it first became limited, changes in the value of the limit are 4031 * After it first became limited, changes in the value of the limit are
4020 * of course permitted. 4032 * of course permitted.
4021 */ 4033 */
4022 mutex_lock(&memcg_create_mutex); 4034 mutex_lock(&memcg_create_mutex);
4023 if (cgroup_has_tasks(memcg->css.cgroup) || 4035 if (cgroup_has_tasks(memcg->css.cgroup) ||
4024 (memcg->use_hierarchy && memcg_has_children(memcg))) 4036 (memcg->use_hierarchy && memcg_has_children(memcg)))
4025 err = -EBUSY; 4037 err = -EBUSY;
4026 mutex_unlock(&memcg_create_mutex); 4038 mutex_unlock(&memcg_create_mutex);
4027 if (err) 4039 if (err)
4028 goto out; 4040 goto out;
4029 4041
4030 memcg_id = memcg_alloc_cache_id(); 4042 memcg_id = memcg_alloc_cache_id();
4031 if (memcg_id < 0) { 4043 if (memcg_id < 0) {
4032 err = memcg_id; 4044 err = memcg_id;
4033 goto out; 4045 goto out;
4034 } 4046 }
4035 4047
4036 memcg->kmemcg_id = memcg_id; 4048 memcg->kmemcg_id = memcg_id;
4037 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 4049 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4038 4050
4039 /* 4051 /*
4040 * We couldn't have accounted to this cgroup, because it hasn't got the 4052 * We couldn't have accounted to this cgroup, because it hasn't got the
4041 * active bit set yet, so this should succeed. 4053 * active bit set yet, so this should succeed.
4042 */ 4054 */
4043 err = page_counter_limit(&memcg->kmem, nr_pages); 4055 err = page_counter_limit(&memcg->kmem, nr_pages);
4044 VM_BUG_ON(err); 4056 VM_BUG_ON(err);
4045 4057
4046 static_key_slow_inc(&memcg_kmem_enabled_key); 4058 static_key_slow_inc(&memcg_kmem_enabled_key);
4047 /* 4059 /*
4048 * Setting the active bit after enabling static branching will 4060 * Setting the active bit after enabling static branching will
4049 * guarantee no one starts accounting before all call sites are 4061 * guarantee no one starts accounting before all call sites are
4050 * patched. 4062 * patched.
4051 */ 4063 */
4052 memcg_kmem_set_active(memcg); 4064 memcg_kmem_set_active(memcg);
4053 out: 4065 out:
4054 memcg_resume_kmem_account(); 4066 memcg_resume_kmem_account();
4055 return err; 4067 return err;
4056 } 4068 }
4057 4069
4058 static int memcg_activate_kmem(struct mem_cgroup *memcg, 4070 static int memcg_activate_kmem(struct mem_cgroup *memcg,
4059 unsigned long nr_pages) 4071 unsigned long nr_pages)
4060 { 4072 {
4061 int ret; 4073 int ret;
4062 4074
4063 mutex_lock(&activate_kmem_mutex); 4075 mutex_lock(&activate_kmem_mutex);
4064 ret = __memcg_activate_kmem(memcg, nr_pages); 4076 ret = __memcg_activate_kmem(memcg, nr_pages);
4065 mutex_unlock(&activate_kmem_mutex); 4077 mutex_unlock(&activate_kmem_mutex);
4066 return ret; 4078 return ret;
4067 } 4079 }
4068 4080
4069 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4081 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4070 unsigned long limit) 4082 unsigned long limit)
4071 { 4083 {
4072 int ret; 4084 int ret;
4073 4085
4074 mutex_lock(&memcg_limit_mutex); 4086 mutex_lock(&memcg_limit_mutex);
4075 if (!memcg_kmem_is_active(memcg)) 4087 if (!memcg_kmem_is_active(memcg))
4076 ret = memcg_activate_kmem(memcg, limit); 4088 ret = memcg_activate_kmem(memcg, limit);
4077 else 4089 else
4078 ret = page_counter_limit(&memcg->kmem, limit); 4090 ret = page_counter_limit(&memcg->kmem, limit);
4079 mutex_unlock(&memcg_limit_mutex); 4091 mutex_unlock(&memcg_limit_mutex);
4080 return ret; 4092 return ret;
4081 } 4093 }
4082 4094
4083 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 4095 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
4084 { 4096 {
4085 int ret = 0; 4097 int ret = 0;
4086 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 4098 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
4087 4099
4088 if (!parent) 4100 if (!parent)
4089 return 0; 4101 return 0;
4090 4102
4091 mutex_lock(&activate_kmem_mutex); 4103 mutex_lock(&activate_kmem_mutex);
4092 /* 4104 /*
4093 * If the parent cgroup is not kmem-active now, it cannot be activated 4105 * If the parent cgroup is not kmem-active now, it cannot be activated
4094 * after this point, because it has at least one child already. 4106 * after this point, because it has at least one child already.
4095 */ 4107 */
4096 if (memcg_kmem_is_active(parent)) 4108 if (memcg_kmem_is_active(parent))
4097 ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); 4109 ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
4098 mutex_unlock(&activate_kmem_mutex); 4110 mutex_unlock(&activate_kmem_mutex);
4099 return ret; 4111 return ret;
4100 } 4112 }
4101 #else 4113 #else
4102 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 4114 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
4103 unsigned long limit) 4115 unsigned long limit)
4104 { 4116 {
4105 return -EINVAL; 4117 return -EINVAL;
4106 } 4118 }
4107 #endif /* CONFIG_MEMCG_KMEM */ 4119 #endif /* CONFIG_MEMCG_KMEM */
4108 4120
4109 /* 4121 /*
4110 * The user of this function is... 4122 * The user of this function is...
4111 * RES_LIMIT. 4123 * RES_LIMIT.
4112 */ 4124 */
4113 static ssize_t mem_cgroup_write(struct kernfs_open_file *of, 4125 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
4114 char *buf, size_t nbytes, loff_t off) 4126 char *buf, size_t nbytes, loff_t off)
4115 { 4127 {
4116 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4128 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4117 unsigned long nr_pages; 4129 unsigned long nr_pages;
4118 int ret; 4130 int ret;
4119 4131
4120 buf = strstrip(buf); 4132 buf = strstrip(buf);
4121 ret = page_counter_memparse(buf, &nr_pages); 4133 ret = page_counter_memparse(buf, &nr_pages);
4122 if (ret) 4134 if (ret)
4123 return ret; 4135 return ret;
4124 4136
4125 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4137 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4126 case RES_LIMIT: 4138 case RES_LIMIT:
4127 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4139 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4128 ret = -EINVAL; 4140 ret = -EINVAL;
4129 break; 4141 break;
4130 } 4142 }
4131 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4143 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4132 case _MEM: 4144 case _MEM:
4133 ret = mem_cgroup_resize_limit(memcg, nr_pages); 4145 ret = mem_cgroup_resize_limit(memcg, nr_pages);
4134 break; 4146 break;
4135 case _MEMSWAP: 4147 case _MEMSWAP:
4136 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); 4148 ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
4137 break; 4149 break;
4138 case _KMEM: 4150 case _KMEM:
4139 ret = memcg_update_kmem_limit(memcg, nr_pages); 4151 ret = memcg_update_kmem_limit(memcg, nr_pages);
4140 break; 4152 break;
4141 } 4153 }
4142 break; 4154 break;
4143 case RES_SOFT_LIMIT: 4155 case RES_SOFT_LIMIT:
4144 memcg->soft_limit = nr_pages; 4156 memcg->soft_limit = nr_pages;
4145 ret = 0; 4157 ret = 0;
4146 break; 4158 break;
4147 } 4159 }
4148 return ret ?: nbytes; 4160 return ret ?: nbytes;
4149 } 4161 }
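The write handler strips the input, converts the human-readable size into a number of pages, and then dispatches on which file was written. The sketch below illustrates only the parse step, assuming the usual convention that "-1" means "no limit" and that K/M/G suffixes scale the value; parse_limit() is an invented helper, not page_counter_memparse().

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define LIMIT_MAX ULONG_MAX

/* Parse "8G", "524288", or "-1" into a page count; returns 0 on success. */
static int parse_limit(const char *buf, unsigned long *nr_pages)
{
	char *end;
	unsigned long long bytes;

	if (!strcmp(buf, "-1")) {          /* "-1" conventionally means unlimited */
		*nr_pages = LIMIT_MAX;
		return 0;
	}

	bytes = strtoull(buf, &end, 10);
	switch (*end) {
	case 'G': case 'g': bytes <<= 10; /* fall through */
	case 'M': case 'm': bytes <<= 10; /* fall through */
	case 'K': case 'k': bytes <<= 10; end++; break;
	case '\0': break;
	default: return -1;                /* trailing junk */
	}
	if (*end != '\0')
		return -1;

	*nr_pages = (unsigned long)(bytes / PAGE_SIZE);
	return 0;
}

int main(void)
{
	const char *inputs[] = { "8G", "524288", "-1", "bogus" };

	for (int i = 0; i < 4; i++) {
		unsigned long pages;

		if (parse_limit(inputs[i], &pages))
			printf("%-8s -> invalid\n", inputs[i]);
		else
			printf("%-8s -> %lu pages\n", inputs[i], pages);
	}
	return 0;
}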
4150 4162
4151 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, 4163 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
4152 size_t nbytes, loff_t off) 4164 size_t nbytes, loff_t off)
4153 { 4165 {
4154 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4166 struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
4155 struct page_counter *counter; 4167 struct page_counter *counter;
4156 4168
4157 switch (MEMFILE_TYPE(of_cft(of)->private)) { 4169 switch (MEMFILE_TYPE(of_cft(of)->private)) {
4158 case _MEM: 4170 case _MEM:
4159 counter = &memcg->memory; 4171 counter = &memcg->memory;
4160 break; 4172 break;
4161 case _MEMSWAP: 4173 case _MEMSWAP:
4162 counter = &memcg->memsw; 4174 counter = &memcg->memsw;
4163 break; 4175 break;
4164 case _KMEM: 4176 case _KMEM:
4165 counter = &memcg->kmem; 4177 counter = &memcg->kmem;
4166 break; 4178 break;
4167 default: 4179 default:
4168 BUG(); 4180 BUG();
4169 } 4181 }
4170 4182
4171 switch (MEMFILE_ATTR(of_cft(of)->private)) { 4183 switch (MEMFILE_ATTR(of_cft(of)->private)) {
4172 case RES_MAX_USAGE: 4184 case RES_MAX_USAGE:
4173 page_counter_reset_watermark(counter); 4185 page_counter_reset_watermark(counter);
4174 break; 4186 break;
4175 case RES_FAILCNT: 4187 case RES_FAILCNT:
4176 counter->failcnt = 0; 4188 counter->failcnt = 0;
4177 break; 4189 break;
4178 default: 4190 default:
4179 BUG(); 4191 BUG();
4180 } 4192 }
4181 4193
4182 return nbytes; 4194 return nbytes;
4183 } 4195 }
4184 4196
4185 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 4197 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
4186 struct cftype *cft) 4198 struct cftype *cft)
4187 { 4199 {
4188 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 4200 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
4189 } 4201 }
4190 4202
4191 #ifdef CONFIG_MMU 4203 #ifdef CONFIG_MMU
4192 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4204 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4193 struct cftype *cft, u64 val) 4205 struct cftype *cft, u64 val)
4194 { 4206 {
4195 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4207 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4196 4208
4197 if (val >= (1 << NR_MOVE_TYPE)) 4209 if (val >= (1 << NR_MOVE_TYPE))
4198 return -EINVAL; 4210 return -EINVAL;
4199 4211
4200 /* 4212 /*
4201 * No kind of locking is needed in here, because ->can_attach() will 4213 * No kind of locking is needed in here, because ->can_attach() will
4202 * check this value once in the beginning of the process, and then carry 4214 * check this value once in the beginning of the process, and then carry
4203 * on with stale data. This means that changes to this value will only 4215 * on with stale data. This means that changes to this value will only
4204 * affect task migrations starting after the change. 4216 * affect task migrations starting after the change.
4205 */ 4217 */
4206 memcg->move_charge_at_immigrate = val; 4218 memcg->move_charge_at_immigrate = val;
4207 return 0; 4219 return 0;
4208 } 4220 }
4209 #else 4221 #else
4210 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 4222 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
4211 struct cftype *cft, u64 val) 4223 struct cftype *cft, u64 val)
4212 { 4224 {
4213 return -ENOSYS; 4225 return -ENOSYS;
4214 } 4226 }
4215 #endif 4227 #endif
4216 4228
4217 #ifdef CONFIG_NUMA 4229 #ifdef CONFIG_NUMA
4218 static int memcg_numa_stat_show(struct seq_file *m, void *v) 4230 static int memcg_numa_stat_show(struct seq_file *m, void *v)
4219 { 4231 {
4220 struct numa_stat { 4232 struct numa_stat {
4221 const char *name; 4233 const char *name;
4222 unsigned int lru_mask; 4234 unsigned int lru_mask;
4223 }; 4235 };
4224 4236
4225 static const struct numa_stat stats[] = { 4237 static const struct numa_stat stats[] = {
4226 { "total", LRU_ALL }, 4238 { "total", LRU_ALL },
4227 { "file", LRU_ALL_FILE }, 4239 { "file", LRU_ALL_FILE },
4228 { "anon", LRU_ALL_ANON }, 4240 { "anon", LRU_ALL_ANON },
4229 { "unevictable", BIT(LRU_UNEVICTABLE) }, 4241 { "unevictable", BIT(LRU_UNEVICTABLE) },
4230 }; 4242 };
4231 const struct numa_stat *stat; 4243 const struct numa_stat *stat;
4232 int nid; 4244 int nid;
4233 unsigned long nr; 4245 unsigned long nr;
4234 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4235 4247
4236 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4248 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4237 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 4249 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
4238 seq_printf(m, "%s=%lu", stat->name, nr); 4250 seq_printf(m, "%s=%lu", stat->name, nr);
4239 for_each_node_state(nid, N_MEMORY) { 4251 for_each_node_state(nid, N_MEMORY) {
4240 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 4252 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
4241 stat->lru_mask); 4253 stat->lru_mask);
4242 seq_printf(m, " N%d=%lu", nid, nr); 4254 seq_printf(m, " N%d=%lu", nid, nr);
4243 } 4255 }
4244 seq_putc(m, '\n'); 4256 seq_putc(m, '\n');
4245 } 4257 }
4246 4258
4247 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 4259 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
4248 struct mem_cgroup *iter; 4260 struct mem_cgroup *iter;
4249 4261
4250 nr = 0; 4262 nr = 0;
4251 for_each_mem_cgroup_tree(iter, memcg) 4263 for_each_mem_cgroup_tree(iter, memcg)
4252 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 4264 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
4253 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 4265 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
4254 for_each_node_state(nid, N_MEMORY) { 4266 for_each_node_state(nid, N_MEMORY) {
4255 nr = 0; 4267 nr = 0;
4256 for_each_mem_cgroup_tree(iter, memcg) 4268 for_each_mem_cgroup_tree(iter, memcg)
4257 nr += mem_cgroup_node_nr_lru_pages( 4269 nr += mem_cgroup_node_nr_lru_pages(
4258 iter, nid, stat->lru_mask); 4270 iter, nid, stat->lru_mask);
4259 seq_printf(m, " N%d=%lu", nid, nr); 4271 seq_printf(m, " N%d=%lu", nid, nr);
4260 } 4272 }
4261 seq_putc(m, '\n'); 4273 seq_putc(m, '\n');
4262 } 4274 }
4263 4275
4264 return 0; 4276 return 0;
4265 } 4277 }
4266 #endif /* CONFIG_NUMA */ 4278 #endif /* CONFIG_NUMA */
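memory.numa_stat emits one line per statistic in the form "name=<total> N0=<node0> N1=<node1> ...". The trivial sketch below reproduces that line format from a per-node array; the node counts are made up, and the total is computed by summing the nodes rather than via an LRU mask as the kernel does.

#include <stdio.h>

#define NR_NODES 2

static void print_numa_line(const char *name, const unsigned long per_node[NR_NODES])
{
	unsigned long total = 0;

	for (int nid = 0; nid < NR_NODES; nid++)
		total += per_node[nid];

	printf("%s=%lu", name, total);
	for (int nid = 0; nid < NR_NODES; nid++)
		printf(" N%d=%lu", nid, per_node[nid]);
	putchar('\n');
}

int main(void)
{
	const unsigned long file[NR_NODES] = { 1024, 256 };
	const unsigned long anon[NR_NODES] = { 300, 12 };

	print_numa_line("file", file);
	print_numa_line("anon", anon);
	return 0;
}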
4267 4279
4268 static inline void mem_cgroup_lru_names_not_uptodate(void) 4280 static inline void mem_cgroup_lru_names_not_uptodate(void)
4269 { 4281 {
4270 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4282 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4271 } 4283 }
4272 4284
4273 static int memcg_stat_show(struct seq_file *m, void *v) 4285 static int memcg_stat_show(struct seq_file *m, void *v)
4274 { 4286 {
4275 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 4287 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
4276 unsigned long memory, memsw; 4288 unsigned long memory, memsw;
4277 struct mem_cgroup *mi; 4289 struct mem_cgroup *mi;
4278 unsigned int i; 4290 unsigned int i;
4279 4291
4280 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4292 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4281 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4293 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4282 continue; 4294 continue;
4283 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4295 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4284 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4296 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4285 } 4297 }
4286 4298
4287 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 4299 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4288 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 4300 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4289 mem_cgroup_read_events(memcg, i)); 4301 mem_cgroup_read_events(memcg, i));
4290 4302
4291 for (i = 0; i < NR_LRU_LISTS; i++) 4303 for (i = 0; i < NR_LRU_LISTS; i++)
4292 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 4304 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4293 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 4305 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4294 4306
4295 /* Hierarchical information */ 4307 /* Hierarchical information */
4296 memory = memsw = PAGE_COUNTER_MAX; 4308 memory = memsw = PAGE_COUNTER_MAX;
4297 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { 4309 for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4298 memory = min(memory, mi->memory.limit); 4310 memory = min(memory, mi->memory.limit);
4299 memsw = min(memsw, mi->memsw.limit); 4311 memsw = min(memsw, mi->memsw.limit);
4300 } 4312 }
4301 seq_printf(m, "hierarchical_memory_limit %llu\n", 4313 seq_printf(m, "hierarchical_memory_limit %llu\n",
4302 (u64)memory * PAGE_SIZE); 4314 (u64)memory * PAGE_SIZE);
4303 if (do_swap_account) 4315 if (do_swap_account)
4304 seq_printf(m, "hierarchical_memsw_limit %llu\n", 4316 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4305 (u64)memsw * PAGE_SIZE); 4317 (u64)memsw * PAGE_SIZE);
4306 4318
4307 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4319 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4308 long long val = 0; 4320 long long val = 0;
4309 4321
4310 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 4322 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4311 continue; 4323 continue;
4312 for_each_mem_cgroup_tree(mi, memcg) 4324 for_each_mem_cgroup_tree(mi, memcg)
4313 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4325 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4314 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 4326 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4315 } 4327 }
4316 4328
4317 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 4329 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4318 unsigned long long val = 0; 4330 unsigned long long val = 0;
4319 4331
4320 for_each_mem_cgroup_tree(mi, memcg) 4332 for_each_mem_cgroup_tree(mi, memcg)
4321 val += mem_cgroup_read_events(mi, i); 4333 val += mem_cgroup_read_events(mi, i);
4322 seq_printf(m, "total_%s %llu\n", 4334 seq_printf(m, "total_%s %llu\n",
4323 mem_cgroup_events_names[i], val); 4335 mem_cgroup_events_names[i], val);
4324 } 4336 }
4325 4337
4326 for (i = 0; i < NR_LRU_LISTS; i++) { 4338 for (i = 0; i < NR_LRU_LISTS; i++) {
4327 unsigned long long val = 0; 4339 unsigned long long val = 0;
4328 4340
4329 for_each_mem_cgroup_tree(mi, memcg) 4341 for_each_mem_cgroup_tree(mi, memcg)
4330 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 4342 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4331 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 4343 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4332 } 4344 }
4333 4345
4334 #ifdef CONFIG_DEBUG_VM 4346 #ifdef CONFIG_DEBUG_VM
4335 { 4347 {
4336 int nid, zid; 4348 int nid, zid;
4337 struct mem_cgroup_per_zone *mz; 4349 struct mem_cgroup_per_zone *mz;
4338 struct zone_reclaim_stat *rstat; 4350 struct zone_reclaim_stat *rstat;
4339 unsigned long recent_rotated[2] = {0, 0}; 4351 unsigned long recent_rotated[2] = {0, 0};
4340 unsigned long recent_scanned[2] = {0, 0}; 4352 unsigned long recent_scanned[2] = {0, 0};
4341 4353
4342 for_each_online_node(nid) 4354 for_each_online_node(nid)
4343 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4355 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4344 mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; 4356 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
4345 rstat = &mz->lruvec.reclaim_stat; 4357 rstat = &mz->lruvec.reclaim_stat;
4346 4358
4347 recent_rotated[0] += rstat->recent_rotated[0]; 4359 recent_rotated[0] += rstat->recent_rotated[0];
4348 recent_rotated[1] += rstat->recent_rotated[1]; 4360 recent_rotated[1] += rstat->recent_rotated[1];
4349 recent_scanned[0] += rstat->recent_scanned[0]; 4361 recent_scanned[0] += rstat->recent_scanned[0];
4350 recent_scanned[1] += rstat->recent_scanned[1]; 4362 recent_scanned[1] += rstat->recent_scanned[1];
4351 } 4363 }
4352 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 4364 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4353 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 4365 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4354 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 4366 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4355 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 4367 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4356 } 4368 }
4357 #endif 4369 #endif
4358 4370
4359 return 0; 4371 return 0;
4360 } 4372 }
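The hierarchical_memory_limit value printed above is simply the minimum limit along the ancestor chain walked via parent_mem_cgroup(). A small standalone model of that walk, with simplified types and ULONG_MAX standing in for PAGE_COUNTER_MAX (illustrative only):

#include <limits.h>
#include <stdio.h>

struct group {
	unsigned long limit;		/* limit in pages; ULONG_MAX ~ "unlimited" */
	const struct group *parent;
};

/* Effective limit = minimum limit over the group and all of its ancestors. */
static unsigned long effective_limit(const struct group *g)
{
	unsigned long min = ULONG_MAX;

	for (; g; g = g->parent)
		if (g->limit < min)
			min = g->limit;
	return min;
}

int main(void)
{
	struct group root   = { ULONG_MAX, NULL };
	struct group parent = { 262144, &root };	/* 1G with 4K pages */
	struct group child  = { 524288, &parent };	/* 2G, higher than the parent */

	/* The parent's tighter limit wins: prints 262144. */
	printf("%lu\n", effective_limit(&child));
	return 0;
}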
4361 4373
4362 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 4374 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4363 struct cftype *cft) 4375 struct cftype *cft)
4364 { 4376 {
4365 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4377 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4366 4378
4367 return mem_cgroup_swappiness(memcg); 4379 return mem_cgroup_swappiness(memcg);
4368 } 4380 }
4369 4381
4370 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 4382 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4371 struct cftype *cft, u64 val) 4383 struct cftype *cft, u64 val)
4372 { 4384 {
4373 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4385 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4374 4386
4375 if (val > 100) 4387 if (val > 100)
4376 return -EINVAL; 4388 return -EINVAL;
4377 4389
4378 if (css->parent) 4390 if (css->parent)
4379 memcg->swappiness = val; 4391 memcg->swappiness = val;
4380 else 4392 else
4381 vm_swappiness = val; 4393 vm_swappiness = val;
4382 4394
4383 return 0; 4395 return 0;
4384 } 4396 }
4385 4397
4386 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4398 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4387 { 4399 {
4388 struct mem_cgroup_threshold_ary *t; 4400 struct mem_cgroup_threshold_ary *t;
4389 unsigned long usage; 4401 unsigned long usage;
4390 int i; 4402 int i;
4391 4403
4392 rcu_read_lock(); 4404 rcu_read_lock();
4393 if (!swap) 4405 if (!swap)
4394 t = rcu_dereference(memcg->thresholds.primary); 4406 t = rcu_dereference(memcg->thresholds.primary);
4395 else 4407 else
4396 t = rcu_dereference(memcg->memsw_thresholds.primary); 4408 t = rcu_dereference(memcg->memsw_thresholds.primary);
4397 4409
4398 if (!t) 4410 if (!t)
4399 goto unlock; 4411 goto unlock;
4400 4412
4401 usage = mem_cgroup_usage(memcg, swap); 4413 usage = mem_cgroup_usage(memcg, swap);
4402 4414
4403 /* 4415 /*
4404 * current_threshold points to the threshold just below or equal to usage. 4416 * current_threshold points to the threshold just below or equal to usage.
4405 * If it's not true, a threshold was crossed after the last 4417 * If it's not true, a threshold was crossed after the last
4406 * call of __mem_cgroup_threshold(). 4418 * call of __mem_cgroup_threshold().
4407 */ 4419 */
4408 i = t->current_threshold; 4420 i = t->current_threshold;
4409 4421
4410 /* 4422 /*
4411 * Iterate backward over array of thresholds starting from 4423 * Iterate backward over array of thresholds starting from
4412 * current_threshold and check if a threshold is crossed. 4424 * current_threshold and check if a threshold is crossed.
4413 * If none of the thresholds below usage is crossed, we read 4425 * If none of the thresholds below usage is crossed, we read
4414 * only one element of the array here. 4426 * only one element of the array here.
4415 */ 4427 */
4416 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4428 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4417 eventfd_signal(t->entries[i].eventfd, 1); 4429 eventfd_signal(t->entries[i].eventfd, 1);
4418 4430
4419 /* i = current_threshold + 1 */ 4431 /* i = current_threshold + 1 */
4420 i++; 4432 i++;
4421 4433
4422 /* 4434 /*
4423 * Iterate forward over array of thresholds starting from 4435 * Iterate forward over array of thresholds starting from
4424 * current_threshold+1 and check if a threshold is crossed. 4436 * current_threshold+1 and check if a threshold is crossed.
4425 * If none of the thresholds above usage is crossed, we read 4437 * If none of the thresholds above usage is crossed, we read
4426 * only one element of the array here. 4438 * only one element of the array here.
4427 */ 4439 */
4428 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4440 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4429 eventfd_signal(t->entries[i].eventfd, 1); 4441 eventfd_signal(t->entries[i].eventfd, 1);
4430 4442
4431 /* Update current_threshold */ 4443 /* Update current_threshold */
4432 t->current_threshold = i - 1; 4444 t->current_threshold = i - 1;
4433 unlock: 4445 unlock:
4434 rcu_read_unlock(); 4446 rcu_read_unlock();
4435 } 4447 }
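To make the two scans above easier to follow, here is a self-contained userspace model of the same walk, using a plain sorted array instead of the RCU-protected mem_cgroup_threshold_ary (illustrative only):

/* Simplified model of __mem_cgroup_threshold(): thresholds[] is sorted
 * ascending, *cur points at the highest entry <= the previous usage.
 * Every entry crossed by the new usage is "signalled". */
#include <stdio.h>

static void scan_thresholds(const unsigned long *thresholds, int size,
			    int *cur, unsigned long usage)
{
	int i = *cur;

	/* Walk backward: thresholds the usage has fallen below. */
	for (; i >= 0 && thresholds[i] > usage; i--)
		printf("signal %lu (fell below)\n", thresholds[i]);

	/* Walk forward from the next entry: newly exceeded thresholds. */
	for (i++; i < size && thresholds[i] <= usage; i++)
		printf("signal %lu (crossed)\n", thresholds[i]);

	*cur = i - 1;	/* highest threshold <= usage, or -1 */
}

int main(void)
{
	unsigned long t[] = { 100, 200, 400, 800 };
	int cur = 1;	/* previous usage was between 200 and 400 */

	scan_thresholds(t, 4, &cur, 500);	/* signals 400, leaves cur = 2 */
	return 0;
}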
4436 4448
4437 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4449 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4438 { 4450 {
4439 while (memcg) { 4451 while (memcg) {
4440 __mem_cgroup_threshold(memcg, false); 4452 __mem_cgroup_threshold(memcg, false);
4441 if (do_swap_account) 4453 if (do_swap_account)
4442 __mem_cgroup_threshold(memcg, true); 4454 __mem_cgroup_threshold(memcg, true);
4443 4455
4444 memcg = parent_mem_cgroup(memcg); 4456 memcg = parent_mem_cgroup(memcg);
4445 } 4457 }
4446 } 4458 }
4447 4459
4448 static int compare_thresholds(const void *a, const void *b) 4460 static int compare_thresholds(const void *a, const void *b)
4449 { 4461 {
4450 const struct mem_cgroup_threshold *_a = a; 4462 const struct mem_cgroup_threshold *_a = a;
4451 const struct mem_cgroup_threshold *_b = b; 4463 const struct mem_cgroup_threshold *_b = b;
4452 4464
4453 if (_a->threshold > _b->threshold) 4465 if (_a->threshold > _b->threshold)
4454 return 1; 4466 return 1;
4455 4467
4456 if (_a->threshold < _b->threshold) 4468 if (_a->threshold < _b->threshold)
4457 return -1; 4469 return -1;
4458 4470
4459 return 0; 4471 return 0;
4460 } 4472 }
4461 4473
4462 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 4474 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4463 { 4475 {
4464 struct mem_cgroup_eventfd_list *ev; 4476 struct mem_cgroup_eventfd_list *ev;
4465 4477
4466 spin_lock(&memcg_oom_lock); 4478 spin_lock(&memcg_oom_lock);
4467 4479
4468 list_for_each_entry(ev, &memcg->oom_notify, list) 4480 list_for_each_entry(ev, &memcg->oom_notify, list)
4469 eventfd_signal(ev->eventfd, 1); 4481 eventfd_signal(ev->eventfd, 1);
4470 4482
4471 spin_unlock(&memcg_oom_lock); 4483 spin_unlock(&memcg_oom_lock);
4472 return 0; 4484 return 0;
4473 } 4485 }
4474 4486
4475 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 4487 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4476 { 4488 {
4477 struct mem_cgroup *iter; 4489 struct mem_cgroup *iter;
4478 4490
4479 for_each_mem_cgroup_tree(iter, memcg) 4491 for_each_mem_cgroup_tree(iter, memcg)
4480 mem_cgroup_oom_notify_cb(iter); 4492 mem_cgroup_oom_notify_cb(iter);
4481 } 4493 }
4482 4494
4483 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4495 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4484 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 4496 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4485 { 4497 {
4486 struct mem_cgroup_thresholds *thresholds; 4498 struct mem_cgroup_thresholds *thresholds;
4487 struct mem_cgroup_threshold_ary *new; 4499 struct mem_cgroup_threshold_ary *new;
4488 unsigned long threshold; 4500 unsigned long threshold;
4489 unsigned long usage; 4501 unsigned long usage;
4490 int i, size, ret; 4502 int i, size, ret;
4491 4503
4492 ret = page_counter_memparse(args, &threshold); 4504 ret = page_counter_memparse(args, &threshold);
4493 if (ret) 4505 if (ret)
4494 return ret; 4506 return ret;
4495 4507
4496 mutex_lock(&memcg->thresholds_lock); 4508 mutex_lock(&memcg->thresholds_lock);
4497 4509
4498 if (type == _MEM) { 4510 if (type == _MEM) {
4499 thresholds = &memcg->thresholds; 4511 thresholds = &memcg->thresholds;
4500 usage = mem_cgroup_usage(memcg, false); 4512 usage = mem_cgroup_usage(memcg, false);
4501 } else if (type == _MEMSWAP) { 4513 } else if (type == _MEMSWAP) {
4502 thresholds = &memcg->memsw_thresholds; 4514 thresholds = &memcg->memsw_thresholds;
4503 usage = mem_cgroup_usage(memcg, true); 4515 usage = mem_cgroup_usage(memcg, true);
4504 } else 4516 } else
4505 BUG(); 4517 BUG();
4506 4518
4507 /* Check if a threshold was crossed before adding a new one */ 4519 /* Check if a threshold was crossed before adding a new one */
4508 if (thresholds->primary) 4520 if (thresholds->primary)
4509 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4521 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4510 4522
4511 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4523 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4512 4524
4513 /* Allocate memory for new array of thresholds */ 4525 /* Allocate memory for new array of thresholds */
4514 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4526 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4515 GFP_KERNEL); 4527 GFP_KERNEL);
4516 if (!new) { 4528 if (!new) {
4517 ret = -ENOMEM; 4529 ret = -ENOMEM;
4518 goto unlock; 4530 goto unlock;
4519 } 4531 }
4520 new->size = size; 4532 new->size = size;
4521 4533
4522 /* Copy thresholds (if any) to new array */ 4534 /* Copy thresholds (if any) to new array */
4523 if (thresholds->primary) { 4535 if (thresholds->primary) {
4524 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4536 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4525 sizeof(struct mem_cgroup_threshold)); 4537 sizeof(struct mem_cgroup_threshold));
4526 } 4538 }
4527 4539
4528 /* Add new threshold */ 4540 /* Add new threshold */
4529 new->entries[size - 1].eventfd = eventfd; 4541 new->entries[size - 1].eventfd = eventfd;
4530 new->entries[size - 1].threshold = threshold; 4542 new->entries[size - 1].threshold = threshold;
4531 4543
4532 /* Sort thresholds. Registering a new threshold isn't time-critical */ 4544 /* Sort thresholds. Registering a new threshold isn't time-critical */
4533 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4545 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4534 compare_thresholds, NULL); 4546 compare_thresholds, NULL);
4535 4547
4536 /* Find current threshold */ 4548 /* Find current threshold */
4537 new->current_threshold = -1; 4549 new->current_threshold = -1;
4538 for (i = 0; i < size; i++) { 4550 for (i = 0; i < size; i++) {
4539 if (new->entries[i].threshold <= usage) { 4551 if (new->entries[i].threshold <= usage) {
4540 /* 4552 /*
4541 * new->current_threshold will not be used until 4553 * new->current_threshold will not be used until
4542 * rcu_assign_pointer(), so it's safe to increment 4554 * rcu_assign_pointer(), so it's safe to increment
4543 * it here. 4555 * it here.
4544 */ 4556 */
4545 ++new->current_threshold; 4557 ++new->current_threshold;
4546 } else 4558 } else
4547 break; 4559 break;
4548 } 4560 }
4549 4561
4550 /* Free old spare buffer and save old primary buffer as spare */ 4562 /* Free old spare buffer and save old primary buffer as spare */
4551 kfree(thresholds->spare); 4563 kfree(thresholds->spare);
4552 thresholds->spare = thresholds->primary; 4564 thresholds->spare = thresholds->primary;
4553 4565
4554 rcu_assign_pointer(thresholds->primary, new); 4566 rcu_assign_pointer(thresholds->primary, new);
4555 4567
4556 /* To be sure that nobody still uses the old thresholds array */ 4568 /* To be sure that nobody still uses the old thresholds array */
4557 synchronize_rcu(); 4569 synchronize_rcu();
4558 4570
4559 unlock: 4571 unlock:
4560 mutex_unlock(&memcg->thresholds_lock); 4572 mutex_unlock(&memcg->thresholds_lock);
4561 4573
4562 return ret; 4574 return ret;
4563 } 4575 }
4564 4576
4565 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 4577 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4566 struct eventfd_ctx *eventfd, const char *args) 4578 struct eventfd_ctx *eventfd, const char *args)
4567 { 4579 {
4568 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 4580 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4569 } 4581 }
4570 4582
4571 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 4583 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4572 struct eventfd_ctx *eventfd, const char *args) 4584 struct eventfd_ctx *eventfd, const char *args)
4573 { 4585 {
4574 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 4586 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4575 } 4587 }
4576 4588
4577 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4589 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4578 struct eventfd_ctx *eventfd, enum res_type type) 4590 struct eventfd_ctx *eventfd, enum res_type type)
4579 { 4591 {
4580 struct mem_cgroup_thresholds *thresholds; 4592 struct mem_cgroup_thresholds *thresholds;
4581 struct mem_cgroup_threshold_ary *new; 4593 struct mem_cgroup_threshold_ary *new;
4582 unsigned long usage; 4594 unsigned long usage;
4583 int i, j, size; 4595 int i, j, size;
4584 4596
4585 mutex_lock(&memcg->thresholds_lock); 4597 mutex_lock(&memcg->thresholds_lock);
4586 4598
4587 if (type == _MEM) { 4599 if (type == _MEM) {
4588 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4589 usage = mem_cgroup_usage(memcg, false); 4601 usage = mem_cgroup_usage(memcg, false);
4590 } else if (type == _MEMSWAP) { 4602 } else if (type == _MEMSWAP) {
4591 thresholds = &memcg->memsw_thresholds; 4603 thresholds = &memcg->memsw_thresholds;
4592 usage = mem_cgroup_usage(memcg, true); 4604 usage = mem_cgroup_usage(memcg, true);
4593 } else 4605 } else
4594 BUG(); 4606 BUG();
4595 4607
4596 if (!thresholds->primary) 4608 if (!thresholds->primary)
4597 goto unlock; 4609 goto unlock;
4598 4610
4599 /* Check if a threshold was crossed before removing */ 4611 /* Check if a threshold was crossed before removing */
4600 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4612 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4601 4613
4602 /* Calculate the new number of thresholds */ 4614 /* Calculate the new number of thresholds */
4603 size = 0; 4615 size = 0;
4604 for (i = 0; i < thresholds->primary->size; i++) { 4616 for (i = 0; i < thresholds->primary->size; i++) {
4605 if (thresholds->primary->entries[i].eventfd != eventfd) 4617 if (thresholds->primary->entries[i].eventfd != eventfd)
4606 size++; 4618 size++;
4607 } 4619 }
4608 4620
4609 new = thresholds->spare; 4621 new = thresholds->spare;
4610 4622
4611 /* Set thresholds array to NULL if we don't have thresholds */ 4623 /* Set thresholds array to NULL if we don't have thresholds */
4612 if (!size) { 4624 if (!size) {
4613 kfree(new); 4625 kfree(new);
4614 new = NULL; 4626 new = NULL;
4615 goto swap_buffers; 4627 goto swap_buffers;
4616 } 4628 }
4617 4629
4618 new->size = size; 4630 new->size = size;
4619 4631
4620 /* Copy thresholds and find current threshold */ 4632 /* Copy thresholds and find current threshold */
4621 new->current_threshold = -1; 4633 new->current_threshold = -1;
4622 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4634 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4623 if (thresholds->primary->entries[i].eventfd == eventfd) 4635 if (thresholds->primary->entries[i].eventfd == eventfd)
4624 continue; 4636 continue;
4625 4637
4626 new->entries[j] = thresholds->primary->entries[i]; 4638 new->entries[j] = thresholds->primary->entries[i];
4627 if (new->entries[j].threshold <= usage) { 4639 if (new->entries[j].threshold <= usage) {
4628 /* 4640 /*
4629 * new->current_threshold will not be used 4641 * new->current_threshold will not be used
4630 * until rcu_assign_pointer(), so it's safe to increment 4642 * until rcu_assign_pointer(), so it's safe to increment
4631 * it here. 4643 * it here.
4632 */ 4644 */
4633 ++new->current_threshold; 4645 ++new->current_threshold;
4634 } 4646 }
4635 j++; 4647 j++;
4636 } 4648 }
4637 4649
4638 swap_buffers: 4650 swap_buffers:
4639 /* Swap primary and spare array */ 4651 /* Swap primary and spare array */
4640 thresholds->spare = thresholds->primary; 4652 thresholds->spare = thresholds->primary;
4641 /* If all events are unregistered, free the spare array */ 4653 /* If all events are unregistered, free the spare array */
4642 if (!new) { 4654 if (!new) {
4643 kfree(thresholds->spare); 4655 kfree(thresholds->spare);
4644 thresholds->spare = NULL; 4656 thresholds->spare = NULL;
4645 } 4657 }
4646 4658
4647 rcu_assign_pointer(thresholds->primary, new); 4659 rcu_assign_pointer(thresholds->primary, new);
4648 4660
4649 /* To be sure that nobody still uses the old thresholds array */ 4661 /* To be sure that nobody still uses the old thresholds array */
4650 synchronize_rcu(); 4662 synchronize_rcu();
4651 unlock: 4663 unlock:
4652 mutex_unlock(&memcg->thresholds_lock); 4664 mutex_unlock(&memcg->thresholds_lock);
4653 } 4665 }
4654 4666
4655 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4667 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4656 struct eventfd_ctx *eventfd) 4668 struct eventfd_ctx *eventfd)
4657 { 4669 {
4658 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 4670 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4659 } 4671 }
4660 4672
4661 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 4673 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4662 struct eventfd_ctx *eventfd) 4674 struct eventfd_ctx *eventfd)
4663 { 4675 {
4664 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 4676 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4665 } 4677 }
4666 4678
4667 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 4679 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4668 struct eventfd_ctx *eventfd, const char *args) 4680 struct eventfd_ctx *eventfd, const char *args)
4669 { 4681 {
4670 struct mem_cgroup_eventfd_list *event; 4682 struct mem_cgroup_eventfd_list *event;
4671 4683
4672 event = kmalloc(sizeof(*event), GFP_KERNEL); 4684 event = kmalloc(sizeof(*event), GFP_KERNEL);
4673 if (!event) 4685 if (!event)
4674 return -ENOMEM; 4686 return -ENOMEM;
4675 4687
4676 spin_lock(&memcg_oom_lock); 4688 spin_lock(&memcg_oom_lock);
4677 4689
4678 event->eventfd = eventfd; 4690 event->eventfd = eventfd;
4679 list_add(&event->list, &memcg->oom_notify); 4691 list_add(&event->list, &memcg->oom_notify);
4680 4692
4681 /* already in OOM ? */ 4693 /* already in OOM ? */
4682 if (atomic_read(&memcg->under_oom)) 4694 if (atomic_read(&memcg->under_oom))
4683 eventfd_signal(eventfd, 1); 4695 eventfd_signal(eventfd, 1);
4684 spin_unlock(&memcg_oom_lock); 4696 spin_unlock(&memcg_oom_lock);
4685 4697
4686 return 0; 4698 return 0;
4687 } 4699 }
4688 4700
4689 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 4701 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4690 struct eventfd_ctx *eventfd) 4702 struct eventfd_ctx *eventfd)
4691 { 4703 {
4692 struct mem_cgroup_eventfd_list *ev, *tmp; 4704 struct mem_cgroup_eventfd_list *ev, *tmp;
4693 4705
4694 spin_lock(&memcg_oom_lock); 4706 spin_lock(&memcg_oom_lock);
4695 4707
4696 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 4708 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4697 if (ev->eventfd == eventfd) { 4709 if (ev->eventfd == eventfd) {
4698 list_del(&ev->list); 4710 list_del(&ev->list);
4699 kfree(ev); 4711 kfree(ev);
4700 } 4712 }
4701 } 4713 }
4702 4714
4703 spin_unlock(&memcg_oom_lock); 4715 spin_unlock(&memcg_oom_lock);
4704 } 4716 }
4705 4717
4706 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 4718 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4707 { 4719 {
4708 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 4720 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
4709 4721
4710 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 4722 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4711 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 4723 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
4712 return 0; 4724 return 0;
4713 } 4725 }
4714 4726
4715 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 4727 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4716 struct cftype *cft, u64 val) 4728 struct cftype *cft, u64 val)
4717 { 4729 {
4718 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4730 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4719 4731
4720 /* cannot be set on the root cgroup and only 0 and 1 are allowed */ 4732 /* cannot be set on the root cgroup and only 0 and 1 are allowed */
4721 if (!css->parent || !((val == 0) || (val == 1))) 4733 if (!css->parent || !((val == 0) || (val == 1)))
4722 return -EINVAL; 4734 return -EINVAL;
4723 4735
4724 memcg->oom_kill_disable = val; 4736 memcg->oom_kill_disable = val;
4725 if (!val) 4737 if (!val)
4726 memcg_oom_recover(memcg); 4738 memcg_oom_recover(memcg);
4727 4739
4728 return 0; 4740 return 0;
4729 } 4741 }
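A hedged userspace sketch of driving the two handlers above: write 1 to memory.oom_control to disable the OOM killer for a group, then read the file back to see oom_kill_disable and under_oom. The cgroup path is an assumption for illustration:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/mygroup/memory.oom_control";
	char line[128];
	FILE *f;
	int fd;

	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, "1", 1) != 1) {	/* only 0 and 1 are accepted */
		perror(path);
		return 1;
	}
	close(fd);

	f = fopen(path, "r");	/* prints oom_kill_disable and under_oom */
	while (f && fgets(line, sizeof(line), f))
		fputs(line, stdout);
	if (f)
		fclose(f);
	return 0;
}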
4730 4742
4731 #ifdef CONFIG_MEMCG_KMEM 4743 #ifdef CONFIG_MEMCG_KMEM
4732 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4744 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4733 { 4745 {
4734 int ret; 4746 int ret;
4735 4747
4736 memcg->kmemcg_id = -1; 4748 memcg->kmemcg_id = -1;
4737 ret = memcg_propagate_kmem(memcg); 4749 ret = memcg_propagate_kmem(memcg);
4738 if (ret) 4750 if (ret)
4739 return ret; 4751 return ret;
4740 4752
4741 return mem_cgroup_sockets_init(memcg, ss); 4753 return mem_cgroup_sockets_init(memcg, ss);
4742 } 4754 }
4743 4755
4744 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4756 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4745 { 4757 {
4746 mem_cgroup_sockets_destroy(memcg); 4758 mem_cgroup_sockets_destroy(memcg);
4747 } 4759 }
4748 4760
4749 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4761 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4750 { 4762 {
4751 if (!memcg_kmem_is_active(memcg)) 4763 if (!memcg_kmem_is_active(memcg))
4752 return; 4764 return;
4753 4765
4754 /* 4766 /*
4755 * kmem charges can outlive the cgroup. In the case of slab 4767 * kmem charges can outlive the cgroup. In the case of slab
4756 * pages, for instance, a page can contain objects from various 4768 * pages, for instance, a page can contain objects from various
4757 * processes. As we do not take a reference for every 4769 * processes. As we do not take a reference for every
4758 * such allocation, we have to be careful when doing uncharge 4770 * such allocation, we have to be careful when doing uncharge
4759 * (see memcg_uncharge_kmem) and here during offlining. 4771 * (see memcg_uncharge_kmem) and here during offlining.
4760 * 4772 *
4761 * The idea is that only the _last_ uncharge which sees 4773 * The idea is that only the _last_ uncharge which sees
4762 * the dead memcg will drop the last reference. An additional 4774 * the dead memcg will drop the last reference. An additional
4763 * reference is taken here before the group is marked dead 4775 * reference is taken here before the group is marked dead
4764 * which is then paired with css_put during uncharge resp. here. 4776 * which is then paired with css_put during uncharge resp. here.
4765 * 4777 *
4766 * Although this might sound strange as this path is called from 4778 * Although this might sound strange as this path is called from
4767 * css_offline() when the reference might have dropped down to 0 and 4779 * css_offline() when the reference might have dropped down to 0 and
4768 * shouldn't be incremented anymore (css_tryget_online() would 4780 * shouldn't be incremented anymore (css_tryget_online() would
4769 * fail), we do not have other options because of the kmem 4781 * fail), we do not have other options because of the kmem
4770 * allocations' lifetime. 4782 * allocations' lifetime.
4771 */ 4783 */
4772 css_get(&memcg->css); 4784 css_get(&memcg->css);
4773 4785
4774 memcg_kmem_mark_dead(memcg); 4786 memcg_kmem_mark_dead(memcg);
4775 4787
4776 if (page_counter_read(&memcg->kmem)) 4788 if (page_counter_read(&memcg->kmem))
4777 return; 4789 return;
4778 4790
4779 if (memcg_kmem_test_and_clear_dead(memcg)) 4791 if (memcg_kmem_test_and_clear_dead(memcg))
4780 css_put(&memcg->css); 4792 css_put(&memcg->css);
4781 } 4793 }
4782 #else 4794 #else
4783 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4795 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4784 { 4796 {
4785 return 0; 4797 return 0;
4786 } 4798 }
4787 4799
4788 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4800 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4789 { 4801 {
4790 } 4802 }
4791 4803
4792 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 4804 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
4793 { 4805 {
4794 } 4806 }
4795 #endif 4807 #endif
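For context, a hedged sketch of the uncharge side that the comment in kmem_cgroup_css_offline() refers to: the last kmem uncharge against a memcg already marked dead drops the extra css reference taken above. This is an illustrative simplification, not the actual function body; the real memcg_uncharge_kmem() also uncharges memcg->memory (and memcg->memsw with swap accounting).

/* Illustrative only: the css_put() pairing for the css_get() taken in
 * kmem_cgroup_css_offline(). */
static void kmem_uncharge_css_pairing(struct mem_cgroup *memcg,
				      unsigned long nr_pages)
{
	page_counter_uncharge(&memcg->kmem, nr_pages);

	/* Charges still outstanding: nothing to release yet. */
	if (page_counter_read(&memcg->kmem))
		return;

	/* Only the last uncharge against a dead memcg drops the
	 * keep-alive reference taken at offline time. */
	if (memcg_kmem_test_and_clear_dead(memcg))
		css_put(&memcg->css);
}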
4796 4808
4797 /* 4809 /*
4798 * DO NOT USE IN NEW FILES. 4810 * DO NOT USE IN NEW FILES.
4799 * 4811 *
4800 * "cgroup.event_control" implementation. 4812 * "cgroup.event_control" implementation.
4801 * 4813 *
4802 * This is way over-engineered. It tries to support fully configurable 4814 * This is way over-engineered. It tries to support fully configurable
4803 * events for each user. Such a level of flexibility is completely 4815 * events for each user. Such a level of flexibility is completely
4804 * unnecessary, especially in light of the planned unified hierarchy. 4816 * unnecessary, especially in light of the planned unified hierarchy.
4805 * 4817 *
4806 * Please deprecate this and replace with something simpler if at all 4818 * Please deprecate this and replace with something simpler if at all
4807 * possible. 4819 * possible.
4808 */ 4820 */
4809 4821
4810 /* 4822 /*
4811 * Unregister event and free resources. 4823 * Unregister event and free resources.
4812 * 4824 *
4813 * Gets called from workqueue. 4825 * Gets called from workqueue.
4814 */ 4826 */
4815 static void memcg_event_remove(struct work_struct *work) 4827 static void memcg_event_remove(struct work_struct *work)
4816 { 4828 {
4817 struct mem_cgroup_event *event = 4829 struct mem_cgroup_event *event =
4818 container_of(work, struct mem_cgroup_event, remove); 4830 container_of(work, struct mem_cgroup_event, remove);
4819 struct mem_cgroup *memcg = event->memcg; 4831 struct mem_cgroup *memcg = event->memcg;
4820 4832
4821 remove_wait_queue(event->wqh, &event->wait); 4833 remove_wait_queue(event->wqh, &event->wait);
4822 4834
4823 event->unregister_event(memcg, event->eventfd); 4835 event->unregister_event(memcg, event->eventfd);
4824 4836
4825 /* Notify userspace the event is going away. */ 4837 /* Notify userspace the event is going away. */
4826 eventfd_signal(event->eventfd, 1); 4838 eventfd_signal(event->eventfd, 1);
4827 4839
4828 eventfd_ctx_put(event->eventfd); 4840 eventfd_ctx_put(event->eventfd);
4829 kfree(event); 4841 kfree(event);
4830 css_put(&memcg->css); 4842 css_put(&memcg->css);
4831 } 4843 }
4832 4844
4833 /* 4845 /*
4834 * Gets called on POLLHUP on eventfd when user closes it. 4846 * Gets called on POLLHUP on eventfd when user closes it.
4835 * 4847 *
4836 * Called with wqh->lock held and interrupts disabled. 4848 * Called with wqh->lock held and interrupts disabled.
4837 */ 4849 */
4838 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 4850 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
4839 int sync, void *key) 4851 int sync, void *key)
4840 { 4852 {
4841 struct mem_cgroup_event *event = 4853 struct mem_cgroup_event *event =
4842 container_of(wait, struct mem_cgroup_event, wait); 4854 container_of(wait, struct mem_cgroup_event, wait);
4843 struct mem_cgroup *memcg = event->memcg; 4855 struct mem_cgroup *memcg = event->memcg;
4844 unsigned long flags = (unsigned long)key; 4856 unsigned long flags = (unsigned long)key;
4845 4857
4846 if (flags & POLLHUP) { 4858 if (flags & POLLHUP) {
4847 /* 4859 /*
4848 * If the event has been detached at cgroup removal, we 4860 * If the event has been detached at cgroup removal, we
4849 * can simply return knowing the other side will clean up 4861 * can simply return knowing the other side will clean up
4850 * for us. 4862 * for us.
4851 * 4863 *
4852 * We can't race against event freeing since the other 4864 * We can't race against event freeing since the other
4853 * side will acquire wqh->lock via remove_wait_queue(), 4865 * side will acquire wqh->lock via remove_wait_queue(),
4854 * which we hold. 4866 * which we hold.
4855 */ 4867 */
4856 spin_lock(&memcg->event_list_lock); 4868 spin_lock(&memcg->event_list_lock);
4857 if (!list_empty(&event->list)) { 4869 if (!list_empty(&event->list)) {
4858 list_del_init(&event->list); 4870 list_del_init(&event->list);
4859 /* 4871 /*
4860 * We are in atomic context, but memcg_event_remove() 4872 * We are in atomic context, but memcg_event_remove()
4861 * may sleep, so we have to call it from a workqueue. 4873 * may sleep, so we have to call it from a workqueue.
4862 */ 4874 */
4863 schedule_work(&event->remove); 4875 schedule_work(&event->remove);
4864 } 4876 }
4865 spin_unlock(&memcg->event_list_lock); 4877 spin_unlock(&memcg->event_list_lock);
4866 } 4878 }
4867 4879
4868 return 0; 4880 return 0;
4869 } 4881 }
4870 4882
4871 static void memcg_event_ptable_queue_proc(struct file *file, 4883 static void memcg_event_ptable_queue_proc(struct file *file,
4872 wait_queue_head_t *wqh, poll_table *pt) 4884 wait_queue_head_t *wqh, poll_table *pt)
4873 { 4885 {
4874 struct mem_cgroup_event *event = 4886 struct mem_cgroup_event *event =
4875 container_of(pt, struct mem_cgroup_event, pt); 4887 container_of(pt, struct mem_cgroup_event, pt);
4876 4888
4877 event->wqh = wqh; 4889 event->wqh = wqh;
4878 add_wait_queue(wqh, &event->wait); 4890 add_wait_queue(wqh, &event->wait);
4879 } 4891 }
4880 4892
4881 /* 4893 /*
4882 * DO NOT USE IN NEW FILES. 4894 * DO NOT USE IN NEW FILES.
4883 * 4895 *
4884 * Parse input and register new cgroup event handler. 4896 * Parse input and register new cgroup event handler.
4885 * 4897 *
4886 * Input must be in format '<event_fd> <control_fd> <args>'. 4898 * Input must be in format '<event_fd> <control_fd> <args>'.
4887 * Interpretation of args is defined by control file implementation. 4899 * Interpretation of args is defined by control file implementation.
4888 */ 4900 */
4889 static ssize_t memcg_write_event_control(struct kernfs_open_file *of, 4901 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4890 char *buf, size_t nbytes, loff_t off) 4902 char *buf, size_t nbytes, loff_t off)
4891 { 4903 {
4892 struct cgroup_subsys_state *css = of_css(of); 4904 struct cgroup_subsys_state *css = of_css(of);
4893 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4905 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4894 struct mem_cgroup_event *event; 4906 struct mem_cgroup_event *event;
4895 struct cgroup_subsys_state *cfile_css; 4907 struct cgroup_subsys_state *cfile_css;
4896 unsigned int efd, cfd; 4908 unsigned int efd, cfd;
4897 struct fd efile; 4909 struct fd efile;
4898 struct fd cfile; 4910 struct fd cfile;
4899 const char *name; 4911 const char *name;
4900 char *endp; 4912 char *endp;
4901 int ret; 4913 int ret;
4902 4914
4903 buf = strstrip(buf); 4915 buf = strstrip(buf);
4904 4916
4905 efd = simple_strtoul(buf, &endp, 10); 4917 efd = simple_strtoul(buf, &endp, 10);
4906 if (*endp != ' ') 4918 if (*endp != ' ')
4907 return -EINVAL; 4919 return -EINVAL;
4908 buf = endp + 1; 4920 buf = endp + 1;
4909 4921
4910 cfd = simple_strtoul(buf, &endp, 10); 4922 cfd = simple_strtoul(buf, &endp, 10);
4911 if ((*endp != ' ') && (*endp != '\0')) 4923 if ((*endp != ' ') && (*endp != '\0'))
4912 return -EINVAL; 4924 return -EINVAL;
4913 buf = endp + 1; 4925 buf = endp + 1;
4914 4926
4915 event = kzalloc(sizeof(*event), GFP_KERNEL); 4927 event = kzalloc(sizeof(*event), GFP_KERNEL);
4916 if (!event) 4928 if (!event)
4917 return -ENOMEM; 4929 return -ENOMEM;
4918 4930
4919 event->memcg = memcg; 4931 event->memcg = memcg;
4920 INIT_LIST_HEAD(&event->list); 4932 INIT_LIST_HEAD(&event->list);
4921 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 4933 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4922 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 4934 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4923 INIT_WORK(&event->remove, memcg_event_remove); 4935 INIT_WORK(&event->remove, memcg_event_remove);
4924 4936
4925 efile = fdget(efd); 4937 efile = fdget(efd);
4926 if (!efile.file) { 4938 if (!efile.file) {
4927 ret = -EBADF; 4939 ret = -EBADF;
4928 goto out_kfree; 4940 goto out_kfree;
4929 } 4941 }
4930 4942
4931 event->eventfd = eventfd_ctx_fileget(efile.file); 4943 event->eventfd = eventfd_ctx_fileget(efile.file);
4932 if (IS_ERR(event->eventfd)) { 4944 if (IS_ERR(event->eventfd)) {
4933 ret = PTR_ERR(event->eventfd); 4945 ret = PTR_ERR(event->eventfd);
4934 goto out_put_efile; 4946 goto out_put_efile;
4935 } 4947 }
4936 4948
4937 cfile = fdget(cfd); 4949 cfile = fdget(cfd);
4938 if (!cfile.file) { 4950 if (!cfile.file) {
4939 ret = -EBADF; 4951 ret = -EBADF;
4940 goto out_put_eventfd; 4952 goto out_put_eventfd;
4941 } 4953 }
4942 4954
4943 /* the process needs read permission on the control file */ 4955 /* the process needs read permission on the control file */
4944 /* AV: shouldn't we check that it's been opened for read instead? */ 4956 /* AV: shouldn't we check that it's been opened for read instead? */
4945 ret = inode_permission(file_inode(cfile.file), MAY_READ); 4957 ret = inode_permission(file_inode(cfile.file), MAY_READ);
4946 if (ret < 0) 4958 if (ret < 0)
4947 goto out_put_cfile; 4959 goto out_put_cfile;
4948 4960
4949 /* 4961 /*
4950 * Determine the event callbacks and set them in @event. This used 4962 * Determine the event callbacks and set them in @event. This used
4951 * to be done via struct cftype but cgroup core no longer knows 4963 * to be done via struct cftype but cgroup core no longer knows
4952 * about these events. The following is crude but the whole thing 4964 * about these events. The following is crude but the whole thing
4953 * is for compatibility anyway. 4965 * is for compatibility anyway.
4954 * 4966 *
4955 * DO NOT ADD NEW FILES. 4967 * DO NOT ADD NEW FILES.
4956 */ 4968 */
4957 name = cfile.file->f_dentry->d_name.name; 4969 name = cfile.file->f_dentry->d_name.name;
4958 4970
4959 if (!strcmp(name, "memory.usage_in_bytes")) { 4971 if (!strcmp(name, "memory.usage_in_bytes")) {
4960 event->register_event = mem_cgroup_usage_register_event; 4972 event->register_event = mem_cgroup_usage_register_event;
4961 event->unregister_event = mem_cgroup_usage_unregister_event; 4973 event->unregister_event = mem_cgroup_usage_unregister_event;
4962 } else if (!strcmp(name, "memory.oom_control")) { 4974 } else if (!strcmp(name, "memory.oom_control")) {
4963 event->register_event = mem_cgroup_oom_register_event; 4975 event->register_event = mem_cgroup_oom_register_event;
4964 event->unregister_event = mem_cgroup_oom_unregister_event; 4976 event->unregister_event = mem_cgroup_oom_unregister_event;
4965 } else if (!strcmp(name, "memory.pressure_level")) { 4977 } else if (!strcmp(name, "memory.pressure_level")) {
4966 event->register_event = vmpressure_register_event; 4978 event->register_event = vmpressure_register_event;
4967 event->unregister_event = vmpressure_unregister_event; 4979 event->unregister_event = vmpressure_unregister_event;
4968 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 4980 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4969 event->register_event = memsw_cgroup_usage_register_event; 4981 event->register_event = memsw_cgroup_usage_register_event;
4970 event->unregister_event = memsw_cgroup_usage_unregister_event; 4982 event->unregister_event = memsw_cgroup_usage_unregister_event;
4971 } else { 4983 } else {
4972 ret = -EINVAL; 4984 ret = -EINVAL;
4973 goto out_put_cfile; 4985 goto out_put_cfile;
4974 } 4986 }
4975 4987
4976 /* 4988 /*
4977 * Verify that @cfile belongs to @css. Also, remaining events are 4989 * Verify that @cfile belongs to @css. Also, remaining events are
4978 * automatically removed on cgroup destruction but the removal is 4990 * automatically removed on cgroup destruction but the removal is
4979 * asynchronous, so take an extra ref on @css. 4991 * asynchronous, so take an extra ref on @css.
4980 */ 4992 */
4981 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, 4993 cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent,
4982 &memory_cgrp_subsys); 4994 &memory_cgrp_subsys);
4983 ret = -EINVAL; 4995 ret = -EINVAL;
4984 if (IS_ERR(cfile_css)) 4996 if (IS_ERR(cfile_css))
4985 goto out_put_cfile; 4997 goto out_put_cfile;
4986 if (cfile_css != css) { 4998 if (cfile_css != css) {
4987 css_put(cfile_css); 4999 css_put(cfile_css);
4988 goto out_put_cfile; 5000 goto out_put_cfile;
4989 } 5001 }
4990 5002
4991 ret = event->register_event(memcg, event->eventfd, buf); 5003 ret = event->register_event(memcg, event->eventfd, buf);
4992 if (ret) 5004 if (ret)
4993 goto out_put_css; 5005 goto out_put_css;
4994 5006
4995 efile.file->f_op->poll(efile.file, &event->pt); 5007 efile.file->f_op->poll(efile.file, &event->pt);
4996 5008
4997 spin_lock(&memcg->event_list_lock); 5009 spin_lock(&memcg->event_list_lock);
4998 list_add(&event->list, &memcg->event_list); 5010 list_add(&event->list, &memcg->event_list);
4999 spin_unlock(&memcg->event_list_lock); 5011 spin_unlock(&memcg->event_list_lock);
5000 5012
5001 fdput(cfile); 5013 fdput(cfile);
5002 fdput(efile); 5014 fdput(efile);
5003 5015
5004 return nbytes; 5016 return nbytes;
5005 5017
5006 out_put_css: 5018 out_put_css:
5007 css_put(css); 5019 css_put(css);
5008 out_put_cfile: 5020 out_put_cfile:
5009 fdput(cfile); 5021 fdput(cfile);
5010 out_put_eventfd: 5022 out_put_eventfd:
5011 eventfd_ctx_put(event->eventfd); 5023 eventfd_ctx_put(event->eventfd);
5012 out_put_efile: 5024 out_put_efile:
5013 fdput(efile); 5025 fdput(efile);
5014 out_kfree: 5026 out_kfree:
5015 kfree(event); 5027 kfree(event);
5016 5028
5017 return ret; 5029 return ret;
5018 } 5030 }
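To illustrate the interface parsed above, a hypothetical userspace example that arms a memory-usage threshold through the legacy cgroup.event_control file: it writes "<event_fd> <control_fd> <args>" exactly as described in the comment before memcg_write_event_control(), then blocks on the eventfd until the threshold is crossed. The cgroup path and the 50M threshold are assumptions:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
	int ctl = open("/sys/fs/cgroup/memory/mygroup/cgroup.event_control", O_WRONLY);
	char buf[64];
	uint64_t ticks;

	if (efd < 0 || cfd < 0 || ctl < 0) {
		perror("open");
		return 1;
	}
	/* "<event_fd> <control_fd> <args>"; args here is a 50M threshold. */
	snprintf(buf, sizeof(buf), "%d %d 52428800", efd, cfd);
	if (write(ctl, buf, strlen(buf)) < 0) {
		perror("write");
		return 1;
	}
	read(efd, &ticks, sizeof(ticks));	/* blocks until the threshold is crossed */
	printf("threshold crossed %llu time(s)\n", (unsigned long long)ticks);
	return 0;
}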
5019 5031
5020 static struct cftype mem_cgroup_files[] = { 5032 static struct cftype mem_cgroup_files[] = {
5021 { 5033 {
5022 .name = "usage_in_bytes", 5034 .name = "usage_in_bytes",
5023 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5035 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5024 .read_u64 = mem_cgroup_read_u64, 5036 .read_u64 = mem_cgroup_read_u64,
5025 }, 5037 },
5026 { 5038 {
5027 .name = "max_usage_in_bytes", 5039 .name = "max_usage_in_bytes",
5028 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5040 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5029 .write = mem_cgroup_reset, 5041 .write = mem_cgroup_reset,
5030 .read_u64 = mem_cgroup_read_u64, 5042 .read_u64 = mem_cgroup_read_u64,
5031 }, 5043 },
5032 { 5044 {
5033 .name = "limit_in_bytes", 5045 .name = "limit_in_bytes",
5034 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5046 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5035 .write = mem_cgroup_write, 5047 .write = mem_cgroup_write,
5036 .read_u64 = mem_cgroup_read_u64, 5048 .read_u64 = mem_cgroup_read_u64,
5037 }, 5049 },
5038 { 5050 {
5039 .name = "soft_limit_in_bytes", 5051 .name = "soft_limit_in_bytes",
5040 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 5052 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5041 .write = mem_cgroup_write, 5053 .write = mem_cgroup_write,
5042 .read_u64 = mem_cgroup_read_u64, 5054 .read_u64 = mem_cgroup_read_u64,
5043 }, 5055 },
5044 { 5056 {
5045 .name = "failcnt", 5057 .name = "failcnt",
5046 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 5058 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5047 .write = mem_cgroup_reset, 5059 .write = mem_cgroup_reset,
5048 .read_u64 = mem_cgroup_read_u64, 5060 .read_u64 = mem_cgroup_read_u64,
5049 }, 5061 },
5050 { 5062 {
5051 .name = "stat", 5063 .name = "stat",
5052 .seq_show = memcg_stat_show, 5064 .seq_show = memcg_stat_show,
5053 }, 5065 },
5054 { 5066 {
5055 .name = "force_empty", 5067 .name = "force_empty",
5056 .write = mem_cgroup_force_empty_write, 5068 .write = mem_cgroup_force_empty_write,
5057 }, 5069 },
5058 { 5070 {
5059 .name = "use_hierarchy", 5071 .name = "use_hierarchy",
5060 .write_u64 = mem_cgroup_hierarchy_write, 5072 .write_u64 = mem_cgroup_hierarchy_write,
5061 .read_u64 = mem_cgroup_hierarchy_read, 5073 .read_u64 = mem_cgroup_hierarchy_read,
5062 }, 5074 },
5063 { 5075 {
5064 .name = "cgroup.event_control", /* XXX: for compat */ 5076 .name = "cgroup.event_control", /* XXX: for compat */
5065 .write = memcg_write_event_control, 5077 .write = memcg_write_event_control,
5066 .flags = CFTYPE_NO_PREFIX, 5078 .flags = CFTYPE_NO_PREFIX,
5067 .mode = S_IWUGO, 5079 .mode = S_IWUGO,
5068 }, 5080 },
5069 { 5081 {
5070 .name = "swappiness", 5082 .name = "swappiness",
5071 .read_u64 = mem_cgroup_swappiness_read, 5083 .read_u64 = mem_cgroup_swappiness_read,
5072 .write_u64 = mem_cgroup_swappiness_write, 5084 .write_u64 = mem_cgroup_swappiness_write,
5073 }, 5085 },
5074 { 5086 {
5075 .name = "move_charge_at_immigrate", 5087 .name = "move_charge_at_immigrate",
5076 .read_u64 = mem_cgroup_move_charge_read, 5088 .read_u64 = mem_cgroup_move_charge_read,
5077 .write_u64 = mem_cgroup_move_charge_write, 5089 .write_u64 = mem_cgroup_move_charge_write,
5078 }, 5090 },
5079 { 5091 {
5080 .name = "oom_control", 5092 .name = "oom_control",
5081 .seq_show = mem_cgroup_oom_control_read, 5093 .seq_show = mem_cgroup_oom_control_read,
5082 .write_u64 = mem_cgroup_oom_control_write, 5094 .write_u64 = mem_cgroup_oom_control_write,
5083 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 5095 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
5084 }, 5096 },
5085 { 5097 {
5086 .name = "pressure_level", 5098 .name = "pressure_level",
5087 }, 5099 },
5088 #ifdef CONFIG_NUMA 5100 #ifdef CONFIG_NUMA
5089 { 5101 {
5090 .name = "numa_stat", 5102 .name = "numa_stat",
5091 .seq_show = memcg_numa_stat_show, 5103 .seq_show = memcg_numa_stat_show,
5092 }, 5104 },
5093 #endif 5105 #endif
5094 #ifdef CONFIG_MEMCG_KMEM 5106 #ifdef CONFIG_MEMCG_KMEM
5095 { 5107 {
5096 .name = "kmem.limit_in_bytes", 5108 .name = "kmem.limit_in_bytes",
5097 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 5109 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
5098 .write = mem_cgroup_write, 5110 .write = mem_cgroup_write,
5099 .read_u64 = mem_cgroup_read_u64, 5111 .read_u64 = mem_cgroup_read_u64,
5100 }, 5112 },
5101 { 5113 {
5102 .name = "kmem.usage_in_bytes", 5114 .name = "kmem.usage_in_bytes",
5103 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 5115 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5104 .read_u64 = mem_cgroup_read_u64, 5116 .read_u64 = mem_cgroup_read_u64,
5105 }, 5117 },
5106 { 5118 {
5107 .name = "kmem.failcnt", 5119 .name = "kmem.failcnt",
5108 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 5120 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5109 .write = mem_cgroup_reset, 5121 .write = mem_cgroup_reset,
5110 .read_u64 = mem_cgroup_read_u64, 5122 .read_u64 = mem_cgroup_read_u64,
5111 }, 5123 },
5112 { 5124 {
5113 .name = "kmem.max_usage_in_bytes", 5125 .name = "kmem.max_usage_in_bytes",
5114 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 5126 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5115 .write = mem_cgroup_reset, 5127 .write = mem_cgroup_reset,
5116 .read_u64 = mem_cgroup_read_u64, 5128 .read_u64 = mem_cgroup_read_u64,
5117 }, 5129 },
5118 #ifdef CONFIG_SLABINFO 5130 #ifdef CONFIG_SLABINFO
5119 { 5131 {
5120 .name = "kmem.slabinfo", 5132 .name = "kmem.slabinfo",
5121 .seq_show = mem_cgroup_slabinfo_read, 5133 .seq_show = mem_cgroup_slabinfo_read,
5122 }, 5134 },
5123 #endif 5135 #endif
5124 #endif 5136 #endif
5125 { }, /* terminate */ 5137 { }, /* terminate */
5126 }; 5138 };
5127 5139
5128 #ifdef CONFIG_MEMCG_SWAP 5140 #ifdef CONFIG_MEMCG_SWAP
5129 static struct cftype memsw_cgroup_files[] = { 5141 static struct cftype memsw_cgroup_files[] = {
5130 { 5142 {
5131 .name = "memsw.usage_in_bytes", 5143 .name = "memsw.usage_in_bytes",
5132 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 5144 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5133 .read_u64 = mem_cgroup_read_u64, 5145 .read_u64 = mem_cgroup_read_u64,
5134 }, 5146 },
5135 { 5147 {
5136 .name = "memsw.max_usage_in_bytes", 5148 .name = "memsw.max_usage_in_bytes",
5137 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 5149 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5138 .write = mem_cgroup_reset, 5150 .write = mem_cgroup_reset,
5139 .read_u64 = mem_cgroup_read_u64, 5151 .read_u64 = mem_cgroup_read_u64,
5140 }, 5152 },
5141 { 5153 {
5142 .name = "memsw.limit_in_bytes", 5154 .name = "memsw.limit_in_bytes",
5143 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 5155 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5144 .write = mem_cgroup_write, 5156 .write = mem_cgroup_write,
5145 .read_u64 = mem_cgroup_read_u64, 5157 .read_u64 = mem_cgroup_read_u64,
5146 }, 5158 },
5147 { 5159 {
5148 .name = "memsw.failcnt", 5160 .name = "memsw.failcnt",
5149 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 5161 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5150 .write = mem_cgroup_reset, 5162 .write = mem_cgroup_reset,
5151 .read_u64 = mem_cgroup_read_u64, 5163 .read_u64 = mem_cgroup_read_u64,
5152 }, 5164 },
5153 { }, /* terminate */ 5165 { }, /* terminate */
5154 }; 5166 };
5155 #endif 5167 #endif
5156 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5168 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5157 { 5169 {
5158 struct mem_cgroup_per_node *pn; 5170 struct mem_cgroup_per_node *pn;
5159 struct mem_cgroup_per_zone *mz; 5171 struct mem_cgroup_per_zone *mz;
5160 int zone, tmp = node; 5172 int zone, tmp = node;
5161 /* 5173 /*
5162 * This routine is called for each possible node. 5174 * This routine is called for each possible node.
5163 * But it's a BUG to call kmalloc() against an offline node. 5175 * But it's a BUG to call kmalloc() against an offline node.
5164 * 5176 *
5165 * TODO: this routine can waste a lot of memory for nodes which will 5177 * TODO: this routine can waste a lot of memory for nodes which will
5166 * never be onlined. It's better to use a memory hotplug callback 5178 * never be onlined. It's better to use a memory hotplug callback
5167 * function. 5179 * function.
5168 */ 5180 */
5169 if (!node_state(node, N_NORMAL_MEMORY)) 5181 if (!node_state(node, N_NORMAL_MEMORY))
5170 tmp = -1; 5182 tmp = -1;
5171 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 5183 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5172 if (!pn) 5184 if (!pn)
5173 return 1; 5185 return 1;
5174 5186
5175 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5187 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5176 mz = &pn->zoneinfo[zone]; 5188 mz = &pn->zoneinfo[zone];
5177 lruvec_init(&mz->lruvec); 5189 lruvec_init(&mz->lruvec);
5178 mz->usage_in_excess = 0; 5190 mz->usage_in_excess = 0;
5179 mz->on_tree = false; 5191 mz->on_tree = false;
5180 mz->memcg = memcg; 5192 mz->memcg = memcg;
5181 } 5193 }
5182 memcg->nodeinfo[node] = pn; 5194 memcg->nodeinfo[node] = pn;
5183 return 0; 5195 return 0;
5184 } 5196 }
5185 5197
5186 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 5198 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
5187 { 5199 {
5188 kfree(memcg->nodeinfo[node]); 5200 kfree(memcg->nodeinfo[node]);
5189 } 5201 }
5190 5202
5191 static struct mem_cgroup *mem_cgroup_alloc(void) 5203 static struct mem_cgroup *mem_cgroup_alloc(void)
5192 { 5204 {
5193 struct mem_cgroup *memcg; 5205 struct mem_cgroup *memcg;
5194 size_t size; 5206 size_t size;
5195 5207
5196 size = sizeof(struct mem_cgroup); 5208 size = sizeof(struct mem_cgroup);
5197 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 5209 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
5198 5210
5199 memcg = kzalloc(size, GFP_KERNEL); 5211 memcg = kzalloc(size, GFP_KERNEL);
5200 if (!memcg) 5212 if (!memcg)
5201 return NULL; 5213 return NULL;
5202 5214
5203 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 5215 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
5204 if (!memcg->stat) 5216 if (!memcg->stat)
5205 goto out_free; 5217 goto out_free;
5206 spin_lock_init(&memcg->pcp_counter_lock); 5218 spin_lock_init(&memcg->pcp_counter_lock);
5207 return memcg; 5219 return memcg;
5208 5220
5209 out_free: 5221 out_free:
5210 kfree(memcg); 5222 kfree(memcg);
5211 return NULL; 5223 return NULL;
5212 } 5224 }
5213 5225
5214 /* 5226 /*
5215 * When destroying a mem_cgroup, references from swap_cgroup can remain. 5227 * When destroying a mem_cgroup, references from swap_cgroup can remain.
5216 * (scanning all at force_empty is too costly...) 5228 * (scanning all at force_empty is too costly...)
5217 * 5229 *
5218 * Instead of clearing all references at force_empty, we remember 5230 * Instead of clearing all references at force_empty, we remember
5219 * the number of references from swap_cgroup and free the mem_cgroup when 5231 * the number of references from swap_cgroup and free the mem_cgroup when
5220 * it goes down to 0. 5232 * it goes down to 0.
5221 * 5233 *
5222 * Removal of cgroup itself succeeds regardless of refs from swap. 5234 * Removal of cgroup itself succeeds regardless of refs from swap.
5223 */ 5235 */
5224 5236
5225 static void __mem_cgroup_free(struct mem_cgroup *memcg) 5237 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5226 { 5238 {
5227 int node; 5239 int node;
5228 5240
5229 mem_cgroup_remove_from_trees(memcg); 5241 mem_cgroup_remove_from_trees(memcg);
5230 5242
5231 for_each_node(node) 5243 for_each_node(node)
5232 free_mem_cgroup_per_zone_info(memcg, node); 5244 free_mem_cgroup_per_zone_info(memcg, node);
5233 5245
5234 free_percpu(memcg->stat); 5246 free_percpu(memcg->stat);
5235 5247
5236 /* 5248 /*
5237 * We need to make sure that (at least for now), the jump label 5249 * We need to make sure that (at least for now), the jump label
5238 * destruction code runs outside of the cgroup lock. This is because 5250 * destruction code runs outside of the cgroup lock. This is because
5239 * get_online_cpus(), which is called from the static_branch update, 5251 * get_online_cpus(), which is called from the static_branch update,
5240 * can't be called inside the cgroup_lock. cpusets are the ones 5252 * can't be called inside the cgroup_lock. cpusets are the ones
5241 * enforcing this dependency, so if they ever change, we might as well. 5253 * enforcing this dependency, so if they ever change, we might as well.
5242 * 5254 *
5243 * schedule_work() will guarantee this happens. Be careful if you need 5255 * schedule_work() will guarantee this happens. Be careful if you need
5244 * to move this code around, and make sure it is outside 5256 * to move this code around, and make sure it is outside
5245 * the cgroup_lock. 5257 * the cgroup_lock.
5246 */ 5258 */
5247 disarm_static_keys(memcg); 5259 disarm_static_keys(memcg);
5248 kfree(memcg); 5260 kfree(memcg);
5249 } 5261 }
5250 5262
5251 /* 5263 /*
5252 * Returns the parent mem_cgroup in the mem_cgroup hierarchy with hierarchy enabled. 5264 * Returns the parent mem_cgroup in the mem_cgroup hierarchy with hierarchy enabled.
5253 */ 5265 */
5254 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 5266 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5255 { 5267 {
5256 if (!memcg->memory.parent) 5268 if (!memcg->memory.parent)
5257 return NULL; 5269 return NULL;
5258 return mem_cgroup_from_counter(memcg->memory.parent, memory); 5270 return mem_cgroup_from_counter(memcg->memory.parent, memory);
5259 } 5271 }
5260 EXPORT_SYMBOL(parent_mem_cgroup); 5272 EXPORT_SYMBOL(parent_mem_cgroup);
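parent_mem_cgroup() returns NULL once the root is reached, so walking up the hierarchy reduces to a simple loop. A minimal sketch of such an ancestor walk (visit_memcg() is a hypothetical placeholder for per-level work, not a kernel function):

	static void walk_memcg_ancestors(struct mem_cgroup *memcg)
	{
		for (; memcg; memcg = parent_mem_cgroup(memcg))
			visit_memcg(memcg);	/* hypothetical per-level work */
	}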
5261 5273
5262 static void __init mem_cgroup_soft_limit_tree_init(void) 5274 static void __init mem_cgroup_soft_limit_tree_init(void)
5263 { 5275 {
5264 struct mem_cgroup_tree_per_node *rtpn; 5276 struct mem_cgroup_tree_per_node *rtpn;
5265 struct mem_cgroup_tree_per_zone *rtpz; 5277 struct mem_cgroup_tree_per_zone *rtpz;
5266 int tmp, node, zone; 5278 int tmp, node, zone;
5267 5279
5268 for_each_node(node) { 5280 for_each_node(node) {
5269 tmp = node; 5281 tmp = node;
5270 if (!node_state(node, N_NORMAL_MEMORY)) 5282 if (!node_state(node, N_NORMAL_MEMORY))
5271 tmp = -1; 5283 tmp = -1;
5272 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5284 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5273 BUG_ON(!rtpn); 5285 BUG_ON(!rtpn);
5274 5286
5275 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5287 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5276 5288
5277 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5289 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5278 rtpz = &rtpn->rb_tree_per_zone[zone]; 5290 rtpz = &rtpn->rb_tree_per_zone[zone];
5279 rtpz->rb_root = RB_ROOT; 5291 rtpz->rb_root = RB_ROOT;
5280 spin_lock_init(&rtpz->lock); 5292 spin_lock_init(&rtpz->lock);
5281 } 5293 }
5282 } 5294 }
5283 } 5295 }
5284 5296
5285 static struct cgroup_subsys_state * __ref 5297 static struct cgroup_subsys_state * __ref
5286 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 5298 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5287 { 5299 {
5288 struct mem_cgroup *memcg; 5300 struct mem_cgroup *memcg;
5289 long error = -ENOMEM; 5301 long error = -ENOMEM;
5290 int node; 5302 int node;
5291 5303
5292 memcg = mem_cgroup_alloc(); 5304 memcg = mem_cgroup_alloc();
5293 if (!memcg) 5305 if (!memcg)
5294 return ERR_PTR(error); 5306 return ERR_PTR(error);
5295 5307
5296 for_each_node(node) 5308 for_each_node(node)
5297 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 5309 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5298 goto free_out; 5310 goto free_out;
5299 5311
5300 /* root ? */ 5312 /* root ? */
5301 if (parent_css == NULL) { 5313 if (parent_css == NULL) {
5302 root_mem_cgroup = memcg; 5314 root_mem_cgroup = memcg;
5303 page_counter_init(&memcg->memory, NULL); 5315 page_counter_init(&memcg->memory, NULL);
5304 page_counter_init(&memcg->memsw, NULL); 5316 page_counter_init(&memcg->memsw, NULL);
5305 page_counter_init(&memcg->kmem, NULL); 5317 page_counter_init(&memcg->kmem, NULL);
5306 } 5318 }
5307 5319
5308 memcg->last_scanned_node = MAX_NUMNODES; 5320 memcg->last_scanned_node = MAX_NUMNODES;
5309 INIT_LIST_HEAD(&memcg->oom_notify); 5321 INIT_LIST_HEAD(&memcg->oom_notify);
5310 memcg->move_charge_at_immigrate = 0; 5322 memcg->move_charge_at_immigrate = 0;
5311 mutex_init(&memcg->thresholds_lock); 5323 mutex_init(&memcg->thresholds_lock);
5312 spin_lock_init(&memcg->move_lock); 5324 spin_lock_init(&memcg->move_lock);
5313 vmpressure_init(&memcg->vmpressure); 5325 vmpressure_init(&memcg->vmpressure);
5314 INIT_LIST_HEAD(&memcg->event_list); 5326 INIT_LIST_HEAD(&memcg->event_list);
5315 spin_lock_init(&memcg->event_list_lock); 5327 spin_lock_init(&memcg->event_list_lock);
5316 5328
5317 return &memcg->css; 5329 return &memcg->css;
5318 5330
5319 free_out: 5331 free_out:
5320 __mem_cgroup_free(memcg); 5332 __mem_cgroup_free(memcg);
5321 return ERR_PTR(error); 5333 return ERR_PTR(error);
5322 } 5334 }
5323 5335
5324 static int 5336 static int
5325 mem_cgroup_css_online(struct cgroup_subsys_state *css) 5337 mem_cgroup_css_online(struct cgroup_subsys_state *css)
5326 { 5338 {
5327 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5339 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5328 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); 5340 struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
5329 int ret; 5341 int ret;
5330 5342
5331 if (css->id > MEM_CGROUP_ID_MAX) 5343 if (css->id > MEM_CGROUP_ID_MAX)
5332 return -ENOSPC; 5344 return -ENOSPC;
5333 5345
5334 if (!parent) 5346 if (!parent)
5335 return 0; 5347 return 0;
5336 5348
5337 mutex_lock(&memcg_create_mutex); 5349 mutex_lock(&memcg_create_mutex);
5338 5350
5339 memcg->use_hierarchy = parent->use_hierarchy; 5351 memcg->use_hierarchy = parent->use_hierarchy;
5340 memcg->oom_kill_disable = parent->oom_kill_disable; 5352 memcg->oom_kill_disable = parent->oom_kill_disable;
5341 memcg->swappiness = mem_cgroup_swappiness(parent); 5353 memcg->swappiness = mem_cgroup_swappiness(parent);
5342 5354
5343 if (parent->use_hierarchy) { 5355 if (parent->use_hierarchy) {
5344 page_counter_init(&memcg->memory, &parent->memory); 5356 page_counter_init(&memcg->memory, &parent->memory);
5345 page_counter_init(&memcg->memsw, &parent->memsw); 5357 page_counter_init(&memcg->memsw, &parent->memsw);
5346 page_counter_init(&memcg->kmem, &parent->kmem); 5358 page_counter_init(&memcg->kmem, &parent->kmem);
5347 5359
5348 /* 5360 /*
5349 * No need to take a reference to the parent because cgroup 5361 * No need to take a reference to the parent because cgroup
5350 * core guarantees its existence. 5362 * core guarantees its existence.
5351 */ 5363 */
5352 } else { 5364 } else {
5353 page_counter_init(&memcg->memory, NULL); 5365 page_counter_init(&memcg->memory, NULL);
5354 page_counter_init(&memcg->memsw, NULL); 5366 page_counter_init(&memcg->memsw, NULL);
5355 page_counter_init(&memcg->kmem, NULL); 5367 page_counter_init(&memcg->kmem, NULL);
5356 /* 5368 /*
5357 * Deeper hierarchy with use_hierarchy == false doesn't make 5369 * Deeper hierarchy with use_hierarchy == false doesn't make
5358 * much sense so let cgroup subsystem know about this 5370 * much sense so let cgroup subsystem know about this
5359 * unfortunate state in our controller. 5371 * unfortunate state in our controller.
5360 */ 5372 */
5361 if (parent != root_mem_cgroup) 5373 if (parent != root_mem_cgroup)
5362 memory_cgrp_subsys.broken_hierarchy = true; 5374 memory_cgrp_subsys.broken_hierarchy = true;
5363 } 5375 }
5364 mutex_unlock(&memcg_create_mutex); 5376 mutex_unlock(&memcg_create_mutex);
5365 5377
5366 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); 5378 ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
5367 if (ret) 5379 if (ret)
5368 return ret; 5380 return ret;
5369 5381
5370 /* 5382 /*
5371 * Make sure the memcg is initialized: mem_cgroup_iter() 5383 * Make sure the memcg is initialized: mem_cgroup_iter()
5372 * orders reading memcg->initialized against its callers 5384 * orders reading memcg->initialized against its callers
5373 * reading the memcg members. 5385 * reading the memcg members.
5374 */ 5386 */
5375 smp_store_release(&memcg->initialized, 1); 5387 smp_store_release(&memcg->initialized, 1);
5376 5388
5377 return 0; 5389 return 0;
5378 } 5390 }
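The smp_store_release() above publishes memcg->initialized only after the initialization stores that precede it; the consumer has to pair it with an acquire load. A hedged sketch of the reader side (an illustration of the pairing only; the real consumer, mem_cgroup_iter(), is outside this hunk):

	/* reader side: only treat memcgs as usable once fully onlined */
	static bool memcg_online_visible(struct mem_cgroup *memcg)
	{
		/* acquire pairs with smp_store_release() in css_online() */
		return smp_load_acquire(&memcg->initialized);
	}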
5379 5391
5380 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 5392 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
5381 { 5393 {
5382 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5394 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5383 struct mem_cgroup_event *event, *tmp; 5395 struct mem_cgroup_event *event, *tmp;
5384 struct cgroup_subsys_state *iter; 5396 struct cgroup_subsys_state *iter;
5385 5397
5386 /* 5398 /*
5387 * Unregister events and notify userspace. 5399 * Unregister events and notify userspace.
5388 * Notify userspace about cgroup removal only after rmdir of the cgroup 5400 * Notify userspace about cgroup removal only after rmdir of the cgroup
5389 * directory to avoid a race between userspace and kernelspace. 5401 * directory to avoid a race between userspace and kernelspace.
5390 */ 5402 */
5391 spin_lock(&memcg->event_list_lock); 5403 spin_lock(&memcg->event_list_lock);
5392 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 5404 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
5393 list_del_init(&event->list); 5405 list_del_init(&event->list);
5394 schedule_work(&event->remove); 5406 schedule_work(&event->remove);
5395 } 5407 }
5396 spin_unlock(&memcg->event_list_lock); 5408 spin_unlock(&memcg->event_list_lock);
5397 5409
5398 kmem_cgroup_css_offline(memcg); 5410 kmem_cgroup_css_offline(memcg);
5399 5411
5400 /* 5412 /*
5401 * This requires that offlining is serialized. Right now that is 5413 * This requires that offlining is serialized. Right now that is
5402 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 5414 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
5403 */ 5415 */
5404 css_for_each_descendant_post(iter, css) 5416 css_for_each_descendant_post(iter, css)
5405 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 5417 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
5406 5418
5407 memcg_unregister_all_caches(memcg); 5419 memcg_unregister_all_caches(memcg);
5408 vmpressure_cleanup(&memcg->vmpressure); 5420 vmpressure_cleanup(&memcg->vmpressure);
5409 } 5421 }
5410 5422
5411 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 5423 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
5412 { 5424 {
5413 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5425 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5414 /* 5426 /*
5415 * XXX: css_offline() would be where we should reparent all 5427 * XXX: css_offline() would be where we should reparent all
5416 * memory to prepare the cgroup for destruction. However, 5428 * memory to prepare the cgroup for destruction. However,
5417 * memcg does not do css_tryget_online() and page_counter charging 5429 * memcg does not do css_tryget_online() and page_counter charging
5418 * under the same RCU lock region, which means that charging 5430 * under the same RCU lock region, which means that charging
5419 * could race with offlining. Offlining only happens to 5431 * could race with offlining. Offlining only happens to
5420 * cgroups with no tasks in them but charges can show up 5432 * cgroups with no tasks in them but charges can show up
5421 * without any tasks from the swapin path when the target 5433 * without any tasks from the swapin path when the target
5422 * memcg is looked up from the swapout record and not from the 5434 * memcg is looked up from the swapout record and not from the
5423 * current task as it usually is. A race like this can leak 5435 * current task as it usually is. A race like this can leak
5424 * charges and put pages with stale cgroup pointers into 5436 * charges and put pages with stale cgroup pointers into
5425 * circulation: 5437 * circulation:
5426 * 5438 *
5427 * #0 #1 5439 * #0 #1
5428 * lookup_swap_cgroup_id() 5440 * lookup_swap_cgroup_id()
5429 * rcu_read_lock() 5441 * rcu_read_lock()
5430 * mem_cgroup_lookup() 5442 * mem_cgroup_lookup()
5431 * css_tryget_online() 5443 * css_tryget_online()
5432 * rcu_read_unlock() 5444 * rcu_read_unlock()
5433 * disable css_tryget_online() 5445 * disable css_tryget_online()
5434 * call_rcu() 5446 * call_rcu()
5435 * offline_css() 5447 * offline_css()
5436 * reparent_charges() 5448 * reparent_charges()
5437 * page_counter_try_charge() 5449 * page_counter_try_charge()
5438 * css_put() 5450 * css_put()
5439 * css_free() 5451 * css_free()
5440 * pc->mem_cgroup = dead memcg 5452 * pc->mem_cgroup = dead memcg
5441 * add page to lru 5453 * add page to lru
5442 * 5454 *
5443 * The bulk of the charges are still moved in offline_css() to 5455 * The bulk of the charges are still moved in offline_css() to
5444 * avoid pinning a lot of pages in case a long-term reference 5456 * avoid pinning a lot of pages in case a long-term reference
5445 * like a swapout record is deferring the css_free() to long 5457 * like a swapout record is deferring the css_free() to long
5446 * after offlining. But this makes sure we catch any charges 5458 * after offlining. But this makes sure we catch any charges
5447 * made after offlining: 5459 * made after offlining:
5448 */ 5460 */
5449 mem_cgroup_reparent_charges(memcg); 5461 mem_cgroup_reparent_charges(memcg);
5450 5462
5451 memcg_destroy_kmem(memcg); 5463 memcg_destroy_kmem(memcg);
5452 __mem_cgroup_free(memcg); 5464 __mem_cgroup_free(memcg);
5453 } 5465 }
5454 5466
5455 /** 5467 /**
5456 * mem_cgroup_css_reset - reset the states of a mem_cgroup 5468 * mem_cgroup_css_reset - reset the states of a mem_cgroup
5457 * @css: the target css 5469 * @css: the target css
5458 * 5470 *
5459 * Reset the states of the mem_cgroup associated with @css. This is 5471 * Reset the states of the mem_cgroup associated with @css. This is
5460 * invoked when the userland requests disabling on the default hierarchy 5472 * invoked when the userland requests disabling on the default hierarchy
5461 * but the memcg is pinned through dependency. The memcg should stop 5473 * but the memcg is pinned through dependency. The memcg should stop
5462 * applying policies and should revert to the vanilla state as it may be 5474 * applying policies and should revert to the vanilla state as it may be
5463 * made visible again. 5475 * made visible again.
5464 * 5476 *
5465 * The current implementation only resets the essential configurations. 5477 * The current implementation only resets the essential configurations.
5466 * This needs to be expanded to cover all the visible parts. 5478 * This needs to be expanded to cover all the visible parts.
5467 */ 5479 */
5468 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) 5480 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
5469 { 5481 {
5470 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5482 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5471 5483
5472 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 5484 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
5473 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 5485 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
5474 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 5486 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
5475 memcg->soft_limit = 0; 5487 memcg->soft_limit = 0;
5476 } 5488 }
5477 5489
5478 #ifdef CONFIG_MMU 5490 #ifdef CONFIG_MMU
5479 /* Handlers for move charge at task migration. */ 5491 /* Handlers for move charge at task migration. */
5480 static int mem_cgroup_do_precharge(unsigned long count) 5492 static int mem_cgroup_do_precharge(unsigned long count)
5481 { 5493 {
5482 int ret; 5494 int ret;
5483 5495
5484 /* Try a single bulk charge without reclaim first */ 5496 /* Try a single bulk charge without reclaim first */
5485 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); 5497 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
5486 if (!ret) { 5498 if (!ret) {
5487 mc.precharge += count; 5499 mc.precharge += count;
5488 return ret; 5500 return ret;
5489 } 5501 }
5490 if (ret == -EINTR) { 5502 if (ret == -EINTR) {
5491 cancel_charge(root_mem_cgroup, count); 5503 cancel_charge(root_mem_cgroup, count);
5492 return ret; 5504 return ret;
5493 } 5505 }
5494 5506
5495 /* Try charges one by one with reclaim */ 5507 /* Try charges one by one with reclaim */
5496 while (count--) { 5508 while (count--) {
5497 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); 5509 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
5498 /* 5510 /*
5499 * In case of failure, any residual charges against 5511 * In case of failure, any residual charges against
5500 * mc.to will be dropped by mem_cgroup_clear_mc() 5512 * mc.to will be dropped by mem_cgroup_clear_mc()
5501 * later on. However, cancel any charges that are 5513 * later on. However, cancel any charges that are
5502 * bypassed to root right away or they'll be lost. 5514 * bypassed to root right away or they'll be lost.
5503 */ 5515 */
5504 if (ret == -EINTR) 5516 if (ret == -EINTR)
5505 cancel_charge(root_mem_cgroup, 1); 5517 cancel_charge(root_mem_cgroup, 1);
5506 if (ret) 5518 if (ret)
5507 return ret; 5519 return ret;
5508 mc.precharge++; 5520 mc.precharge++;
5509 cond_resched(); 5521 cond_resched();
5510 } 5522 }
5511 return 0; 5523 return 0;
5512 } 5524 }
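mem_cgroup_do_precharge() follows a two-phase pattern: one optimistic bulk attempt that is not allowed to reclaim, then a per-page fallback that is. The same shape, stripped of the memcg specifics (try_bulk() and try_one() are hypothetical stand-ins for the try_charge() calls above):

	static int precharge_pattern(unsigned long count)
	{
		if (!try_bulk(count))		/* hypothetical: no reclaim, may fail under pressure */
			return 0;
		while (count--) {		/* fallback: one unit at a time, with reclaim */
			int ret = try_one();	/* hypothetical single-unit charge */
			if (ret)
				return ret;
		}
		return 0;
	}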
5513 5525
5514 /** 5526 /**
5515 * get_mctgt_type - get target type of moving charge 5527 * get_mctgt_type - get target type of moving charge
5516 * @vma: the vma the pte to be checked belongs 5528 * @vma: the vma the pte to be checked belongs
5517 * @addr: the address corresponding to the pte to be checked 5529 * @addr: the address corresponding to the pte to be checked
5518 * @ptent: the pte to be checked 5530 * @ptent: the pte to be checked
5519 * @target: pointer where the target page or swap entry will be stored (can be NULL) 5531 * @target: pointer where the target page or swap entry will be stored (can be NULL)
5520 * 5532 *
5521 * Returns 5533 * Returns
5522 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5534 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5523 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5535 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5524 * move charge. if @target is not NULL, the page is stored in target->page 5536 * move charge. if @target is not NULL, the page is stored in target->page
5525 * with extra refcnt got(Callers should handle it). 5537 * with extra refcnt got(Callers should handle it).
5526 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5538 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5527 * target for charge migration. if @target is not NULL, the entry is stored 5539 * target for charge migration. if @target is not NULL, the entry is stored
5528 * in target->ent. 5540 * in target->ent.
5529 * 5541 *
5530 * Called with pte lock held. 5542 * Called with pte lock held.
5531 */ 5543 */
5532 union mc_target { 5544 union mc_target {
5533 struct page *page; 5545 struct page *page;
5534 swp_entry_t ent; 5546 swp_entry_t ent;
5535 }; 5547 };
5536 5548
5537 enum mc_target_type { 5549 enum mc_target_type {
5538 MC_TARGET_NONE = 0, 5550 MC_TARGET_NONE = 0,
5539 MC_TARGET_PAGE, 5551 MC_TARGET_PAGE,
5540 MC_TARGET_SWAP, 5552 MC_TARGET_SWAP,
5541 }; 5553 };
5542 5554
5543 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5555 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5544 unsigned long addr, pte_t ptent) 5556 unsigned long addr, pte_t ptent)
5545 { 5557 {
5546 struct page *page = vm_normal_page(vma, addr, ptent); 5558 struct page *page = vm_normal_page(vma, addr, ptent);
5547 5559
5548 if (!page || !page_mapped(page)) 5560 if (!page || !page_mapped(page))
5549 return NULL; 5561 return NULL;
5550 if (PageAnon(page)) { 5562 if (PageAnon(page)) {
5551 /* we don't move shared anon */ 5563 /* we don't move shared anon */
5552 if (!move_anon()) 5564 if (!move_anon())
5553 return NULL; 5565 return NULL;
5554 } else if (!move_file()) 5566 } else if (!move_file())
5555 /* we ignore mapcount for file pages */ 5567 /* we ignore mapcount for file pages */
5556 return NULL; 5568 return NULL;
5557 if (!get_page_unless_zero(page)) 5569 if (!get_page_unless_zero(page))
5558 return NULL; 5570 return NULL;
5559 5571
5560 return page; 5572 return page;
5561 } 5573 }
5562 5574
5563 #ifdef CONFIG_SWAP 5575 #ifdef CONFIG_SWAP
5564 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5576 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5565 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5577 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5566 { 5578 {
5567 struct page *page = NULL; 5579 struct page *page = NULL;
5568 swp_entry_t ent = pte_to_swp_entry(ptent); 5580 swp_entry_t ent = pte_to_swp_entry(ptent);
5569 5581
5570 if (!move_anon() || non_swap_entry(ent)) 5582 if (!move_anon() || non_swap_entry(ent))
5571 return NULL; 5583 return NULL;
5572 /* 5584 /*
5573 * Because lookup_swap_cache() updates some statistics counter, 5585 * Because lookup_swap_cache() updates some statistics counter,
5574 * we call find_get_page() with swapper_space directly. 5586 * we call find_get_page() with swapper_space directly.
5575 */ 5587 */
5576 page = find_get_page(swap_address_space(ent), ent.val); 5588 page = find_get_page(swap_address_space(ent), ent.val);
5577 if (do_swap_account) 5589 if (do_swap_account)
5578 entry->val = ent.val; 5590 entry->val = ent.val;
5579 5591
5580 return page; 5592 return page;
5581 } 5593 }
5582 #else 5594 #else
5583 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5595 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5584 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5596 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5585 { 5597 {
5586 return NULL; 5598 return NULL;
5587 } 5599 }
5588 #endif 5600 #endif
5589 5601
5590 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5602 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5591 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5603 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5592 { 5604 {
5593 struct page *page = NULL; 5605 struct page *page = NULL;
5594 struct address_space *mapping; 5606 struct address_space *mapping;
5595 pgoff_t pgoff; 5607 pgoff_t pgoff;
5596 5608
5597 if (!vma->vm_file) /* anonymous vma */ 5609 if (!vma->vm_file) /* anonymous vma */
5598 return NULL; 5610 return NULL;
5599 if (!move_file()) 5611 if (!move_file())
5600 return NULL; 5612 return NULL;
5601 5613
5602 mapping = vma->vm_file->f_mapping; 5614 mapping = vma->vm_file->f_mapping;
5603 if (pte_none(ptent)) 5615 if (pte_none(ptent))
5604 pgoff = linear_page_index(vma, addr); 5616 pgoff = linear_page_index(vma, addr);
5605 else /* pte_file(ptent) is true */ 5617 else /* pte_file(ptent) is true */
5606 pgoff = pte_to_pgoff(ptent); 5618 pgoff = pte_to_pgoff(ptent);
5607 5619
5608 /* The page is moved even if it's not RSS of this task (page-faulted). */ 5620 /* The page is moved even if it's not RSS of this task (page-faulted). */
5609 #ifdef CONFIG_SWAP 5621 #ifdef CONFIG_SWAP
5610 /* shmem/tmpfs may report page out on swap: account for that too. */ 5622 /* shmem/tmpfs may report page out on swap: account for that too. */
5611 if (shmem_mapping(mapping)) { 5623 if (shmem_mapping(mapping)) {
5612 page = find_get_entry(mapping, pgoff); 5624 page = find_get_entry(mapping, pgoff);
5613 if (radix_tree_exceptional_entry(page)) { 5625 if (radix_tree_exceptional_entry(page)) {
5614 swp_entry_t swp = radix_to_swp_entry(page); 5626 swp_entry_t swp = radix_to_swp_entry(page);
5615 if (do_swap_account) 5627 if (do_swap_account)
5616 *entry = swp; 5628 *entry = swp;
5617 page = find_get_page(swap_address_space(swp), swp.val); 5629 page = find_get_page(swap_address_space(swp), swp.val);
5618 } 5630 }
5619 } else 5631 } else
5620 page = find_get_page(mapping, pgoff); 5632 page = find_get_page(mapping, pgoff);
5621 #else 5633 #else
5622 page = find_get_page(mapping, pgoff); 5634 page = find_get_page(mapping, pgoff);
5623 #endif 5635 #endif
5624 return page; 5636 return page;
5625 } 5637 }
5626 5638
5627 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 5639 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5628 unsigned long addr, pte_t ptent, union mc_target *target) 5640 unsigned long addr, pte_t ptent, union mc_target *target)
5629 { 5641 {
5630 struct page *page = NULL; 5642 struct page *page = NULL;
5631 struct page_cgroup *pc; 5643 struct page_cgroup *pc;
5632 enum mc_target_type ret = MC_TARGET_NONE; 5644 enum mc_target_type ret = MC_TARGET_NONE;
5633 swp_entry_t ent = { .val = 0 }; 5645 swp_entry_t ent = { .val = 0 };
5634 5646
5635 if (pte_present(ptent)) 5647 if (pte_present(ptent))
5636 page = mc_handle_present_pte(vma, addr, ptent); 5648 page = mc_handle_present_pte(vma, addr, ptent);
5637 else if (is_swap_pte(ptent)) 5649 else if (is_swap_pte(ptent))
5638 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5650 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5639 else if (pte_none(ptent) || pte_file(ptent)) 5651 else if (pte_none(ptent) || pte_file(ptent))
5640 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5652 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5641 5653
5642 if (!page && !ent.val) 5654 if (!page && !ent.val)
5643 return ret; 5655 return ret;
5644 if (page) { 5656 if (page) {
5645 pc = lookup_page_cgroup(page); 5657 pc = lookup_page_cgroup(page);
5646 /* 5658 /*
5647 * Do only a loose check w/o serialization. 5659 * Do only a loose check w/o serialization.
5648 * mem_cgroup_move_account() checks whether the pc is valid 5660 * mem_cgroup_move_account() checks whether the pc is valid
5649 * under LRU exclusion. 5661 * under LRU exclusion.
5650 */ 5662 */
5651 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5663 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5652 ret = MC_TARGET_PAGE; 5664 ret = MC_TARGET_PAGE;
5653 if (target) 5665 if (target)
5654 target->page = page; 5666 target->page = page;
5655 } 5667 }
5656 if (!ret || !target) 5668 if (!ret || !target)
5657 put_page(page); 5669 put_page(page);
5658 } 5670 }
5659 /* There is a swap entry and a page doesn't exist or isn't charged */ 5671 /* There is a swap entry and a page doesn't exist or isn't charged */
5660 if (ent.val && !ret && 5672 if (ent.val && !ret &&
5661 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 5673 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
5662 ret = MC_TARGET_SWAP; 5674 ret = MC_TARGET_SWAP;
5663 if (target) 5675 if (target)
5664 target->ent = ent; 5676 target->ent = ent;
5665 } 5677 }
5666 return ret; 5678 return ret;
5667 } 5679 }
5668 5680
5669 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 5681 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5670 /* 5682 /*
5671 * We don't consider swapping or file mapped pages because THP does not 5683 * We don't consider swapping or file mapped pages because THP does not
5672 * support them for now. 5684 * support them for now.
5673 * Caller should make sure that pmd_trans_huge(pmd) is true. 5685 * Caller should make sure that pmd_trans_huge(pmd) is true.
5674 */ 5686 */
5675 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5687 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5676 unsigned long addr, pmd_t pmd, union mc_target *target) 5688 unsigned long addr, pmd_t pmd, union mc_target *target)
5677 { 5689 {
5678 struct page *page = NULL; 5690 struct page *page = NULL;
5679 struct page_cgroup *pc; 5691 struct page_cgroup *pc;
5680 enum mc_target_type ret = MC_TARGET_NONE; 5692 enum mc_target_type ret = MC_TARGET_NONE;
5681 5693
5682 page = pmd_page(pmd); 5694 page = pmd_page(pmd);
5683 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5695 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5684 if (!move_anon()) 5696 if (!move_anon())
5685 return ret; 5697 return ret;
5686 pc = lookup_page_cgroup(page); 5698 pc = lookup_page_cgroup(page);
5687 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5699 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5688 ret = MC_TARGET_PAGE; 5700 ret = MC_TARGET_PAGE;
5689 if (target) { 5701 if (target) {
5690 get_page(page); 5702 get_page(page);
5691 target->page = page; 5703 target->page = page;
5692 } 5704 }
5693 } 5705 }
5694 return ret; 5706 return ret;
5695 } 5707 }
5696 #else 5708 #else
5697 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 5709 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5698 unsigned long addr, pmd_t pmd, union mc_target *target) 5710 unsigned long addr, pmd_t pmd, union mc_target *target)
5699 { 5711 {
5700 return MC_TARGET_NONE; 5712 return MC_TARGET_NONE;
5701 } 5713 }
5702 #endif 5714 #endif
5703 5715
5704 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5716 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5705 unsigned long addr, unsigned long end, 5717 unsigned long addr, unsigned long end,
5706 struct mm_walk *walk) 5718 struct mm_walk *walk)
5707 { 5719 {
5708 struct vm_area_struct *vma = walk->private; 5720 struct vm_area_struct *vma = walk->private;
5709 pte_t *pte; 5721 pte_t *pte;
5710 spinlock_t *ptl; 5722 spinlock_t *ptl;
5711 5723
5712 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5724 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5713 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 5725 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
5714 mc.precharge += HPAGE_PMD_NR; 5726 mc.precharge += HPAGE_PMD_NR;
5715 spin_unlock(ptl); 5727 spin_unlock(ptl);
5716 return 0; 5728 return 0;
5717 } 5729 }
5718 5730
5719 if (pmd_trans_unstable(pmd)) 5731 if (pmd_trans_unstable(pmd))
5720 return 0; 5732 return 0;
5721 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5733 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5722 for (; addr != end; pte++, addr += PAGE_SIZE) 5734 for (; addr != end; pte++, addr += PAGE_SIZE)
5723 if (get_mctgt_type(vma, addr, *pte, NULL)) 5735 if (get_mctgt_type(vma, addr, *pte, NULL))
5724 mc.precharge++; /* increment precharge temporarily */ 5736 mc.precharge++; /* increment precharge temporarily */
5725 pte_unmap_unlock(pte - 1, ptl); 5737 pte_unmap_unlock(pte - 1, ptl);
5726 cond_resched(); 5738 cond_resched();
5727 5739
5728 return 0; 5740 return 0;
5729 } 5741 }
5730 5742
5731 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5743 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5732 { 5744 {
5733 unsigned long precharge; 5745 unsigned long precharge;
5734 struct vm_area_struct *vma; 5746 struct vm_area_struct *vma;
5735 5747
5736 down_read(&mm->mmap_sem); 5748 down_read(&mm->mmap_sem);
5737 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5749 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5738 struct mm_walk mem_cgroup_count_precharge_walk = { 5750 struct mm_walk mem_cgroup_count_precharge_walk = {
5739 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5751 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5740 .mm = mm, 5752 .mm = mm,
5741 .private = vma, 5753 .private = vma,
5742 }; 5754 };
5743 if (is_vm_hugetlb_page(vma)) 5755 if (is_vm_hugetlb_page(vma))
5744 continue; 5756 continue;
5745 walk_page_range(vma->vm_start, vma->vm_end, 5757 walk_page_range(vma->vm_start, vma->vm_end,
5746 &mem_cgroup_count_precharge_walk); 5758 &mem_cgroup_count_precharge_walk);
5747 } 5759 }
5748 up_read(&mm->mmap_sem); 5760 up_read(&mm->mmap_sem);
5749 5761
5750 precharge = mc.precharge; 5762 precharge = mc.precharge;
5751 mc.precharge = 0; 5763 mc.precharge = 0;
5752 5764
5753 return precharge; 5765 return precharge;
5754 } 5766 }
5755 5767
5756 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5768 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5757 { 5769 {
5758 unsigned long precharge = mem_cgroup_count_precharge(mm); 5770 unsigned long precharge = mem_cgroup_count_precharge(mm);
5759 5771
5760 VM_BUG_ON(mc.moving_task); 5772 VM_BUG_ON(mc.moving_task);
5761 mc.moving_task = current; 5773 mc.moving_task = current;
5762 return mem_cgroup_do_precharge(precharge); 5774 return mem_cgroup_do_precharge(precharge);
5763 } 5775 }
5764 5776
5765 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5777 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5766 static void __mem_cgroup_clear_mc(void) 5778 static void __mem_cgroup_clear_mc(void)
5767 { 5779 {
5768 struct mem_cgroup *from = mc.from; 5780 struct mem_cgroup *from = mc.from;
5769 struct mem_cgroup *to = mc.to; 5781 struct mem_cgroup *to = mc.to;
5770 int i;
5771 5782
5772 /* we must uncharge all the leftover precharges from mc.to */ 5783 /* we must uncharge all the leftover precharges from mc.to */
5773 if (mc.precharge) { 5784 if (mc.precharge) {
5774 cancel_charge(mc.to, mc.precharge); 5785 cancel_charge(mc.to, mc.precharge);
5775 mc.precharge = 0; 5786 mc.precharge = 0;
5776 } 5787 }
5777 /* 5788 /*
5778 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5789 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5779 * we must uncharge here. 5790 * we must uncharge here.
5780 */ 5791 */
5781 if (mc.moved_charge) { 5792 if (mc.moved_charge) {
5782 cancel_charge(mc.from, mc.moved_charge); 5793 cancel_charge(mc.from, mc.moved_charge);
5783 mc.moved_charge = 0; 5794 mc.moved_charge = 0;
5784 } 5795 }
5785 /* we must fixup refcnts and charges */ 5796 /* we must fixup refcnts and charges */
5786 if (mc.moved_swap) { 5797 if (mc.moved_swap) {
5787 /* uncharge swap account from the old cgroup */ 5798 /* uncharge swap account from the old cgroup */
5788 if (!mem_cgroup_is_root(mc.from)) 5799 if (!mem_cgroup_is_root(mc.from))
5789 page_counter_uncharge(&mc.from->memsw, mc.moved_swap); 5800 page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5790 5801
5791 /* 5802 /*
5792 * we charged both to->memory and to->memsw, so we 5803 * we charged both to->memory and to->memsw, so we
5793 * should uncharge to->memory. 5804 * should uncharge to->memory.
5794 */ 5805 */
5795 if (!mem_cgroup_is_root(mc.to)) 5806 if (!mem_cgroup_is_root(mc.to))
5796 page_counter_uncharge(&mc.to->memory, mc.moved_swap); 5807 page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5797 5808
5798 for (i = 0; i < mc.moved_swap; i++) 5809 css_put_many(&mc.from->css, mc.moved_swap);
5799 css_put(&mc.from->css);
5800 5810
5801 /* we've already done css_get(mc.to) */ 5811 /* we've already done css_get(mc.to) */
5802 mc.moved_swap = 0; 5812 mc.moved_swap = 0;
5803 } 5813 }
5804 memcg_oom_recover(from); 5814 memcg_oom_recover(from);
5805 memcg_oom_recover(to); 5815 memcg_oom_recover(to);
5806 wake_up_all(&mc.waitq); 5816 wake_up_all(&mc.waitq);
5807 } 5817 }
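The change in this hunk collapses a loop of css_put() calls into a single css_put_many(): mc.moved_swap counts the references being dropped on mc.from's css, and the two forms release exactly the same number of them. Side by side, outside the diff markup:

	/* before: drop the references one at a time */
	for (i = 0; i < mc.moved_swap; i++)
		css_put(&mc.from->css);

	/* after: equivalent, in one call */
	css_put_many(&mc.from->css, mc.moved_swap);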
5808 5818
5809 static void mem_cgroup_clear_mc(void) 5819 static void mem_cgroup_clear_mc(void)
5810 { 5820 {
5811 struct mem_cgroup *from = mc.from; 5821 struct mem_cgroup *from = mc.from;
5812 5822
5813 /* 5823 /*
5814 * we must clear moving_task before waking up waiters at the end of 5824 * we must clear moving_task before waking up waiters at the end of
5815 * task migration. 5825 * task migration.
5816 */ 5826 */
5817 mc.moving_task = NULL; 5827 mc.moving_task = NULL;
5818 __mem_cgroup_clear_mc(); 5828 __mem_cgroup_clear_mc();
5819 spin_lock(&mc.lock); 5829 spin_lock(&mc.lock);
5820 mc.from = NULL; 5830 mc.from = NULL;
5821 mc.to = NULL; 5831 mc.to = NULL;
5822 spin_unlock(&mc.lock); 5832 spin_unlock(&mc.lock);
5823 mem_cgroup_end_move(from); 5833 mem_cgroup_end_move(from);
5824 } 5834 }
5825 5835
5826 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 5836 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5827 struct cgroup_taskset *tset) 5837 struct cgroup_taskset *tset)
5828 { 5838 {
5829 struct task_struct *p = cgroup_taskset_first(tset); 5839 struct task_struct *p = cgroup_taskset_first(tset);
5830 int ret = 0; 5840 int ret = 0;
5831 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5841 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5832 unsigned long move_charge_at_immigrate; 5842 unsigned long move_charge_at_immigrate;
5833 5843
5834 /* 5844 /*
5835 * We are now committed to this value whatever it is. Changes in this 5845 * We are now committed to this value whatever it is. Changes in this
5836 * tunable will only affect upcoming migrations, not the current one. 5846 * tunable will only affect upcoming migrations, not the current one.
5837 * So we need to save it, and keep it going. 5847 * So we need to save it, and keep it going.
5838 */ 5848 */
5839 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5849 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
5840 if (move_charge_at_immigrate) { 5850 if (move_charge_at_immigrate) {
5841 struct mm_struct *mm; 5851 struct mm_struct *mm;
5842 struct mem_cgroup *from = mem_cgroup_from_task(p); 5852 struct mem_cgroup *from = mem_cgroup_from_task(p);
5843 5853
5844 VM_BUG_ON(from == memcg); 5854 VM_BUG_ON(from == memcg);
5845 5855
5846 mm = get_task_mm(p); 5856 mm = get_task_mm(p);
5847 if (!mm) 5857 if (!mm)
5848 return 0; 5858 return 0;
5849 /* We move charges only when we move the owner of the mm */ 5859 /* We move charges only when we move the owner of the mm */
5850 if (mm->owner == p) { 5860 if (mm->owner == p) {
5851 VM_BUG_ON(mc.from); 5861 VM_BUG_ON(mc.from);
5852 VM_BUG_ON(mc.to); 5862 VM_BUG_ON(mc.to);
5853 VM_BUG_ON(mc.precharge); 5863 VM_BUG_ON(mc.precharge);
5854 VM_BUG_ON(mc.moved_charge); 5864 VM_BUG_ON(mc.moved_charge);
5855 VM_BUG_ON(mc.moved_swap); 5865 VM_BUG_ON(mc.moved_swap);
5856 mem_cgroup_start_move(from); 5866 mem_cgroup_start_move(from);
5857 spin_lock(&mc.lock); 5867 spin_lock(&mc.lock);
5858 mc.from = from; 5868 mc.from = from;
5859 mc.to = memcg; 5869 mc.to = memcg;
5860 mc.immigrate_flags = move_charge_at_immigrate; 5870 mc.immigrate_flags = move_charge_at_immigrate;
5861 spin_unlock(&mc.lock); 5871 spin_unlock(&mc.lock);
5862 /* We set mc.moving_task later */ 5872 /* We set mc.moving_task later */
5863 5873
5864 ret = mem_cgroup_precharge_mc(mm); 5874 ret = mem_cgroup_precharge_mc(mm);
5865 if (ret) 5875 if (ret)
5866 mem_cgroup_clear_mc(); 5876 mem_cgroup_clear_mc();
5867 } 5877 }
5868 mmput(mm); 5878 mmput(mm);
5869 } 5879 }
5870 return ret; 5880 return ret;
5871 } 5881 }
5872 5882
5873 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 5883 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
5874 struct cgroup_taskset *tset) 5884 struct cgroup_taskset *tset)
5875 { 5885 {
5876 mem_cgroup_clear_mc(); 5886 mem_cgroup_clear_mc();
5877 } 5887 }
5878 5888
5879 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5889 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5880 unsigned long addr, unsigned long end, 5890 unsigned long addr, unsigned long end,
5881 struct mm_walk *walk) 5891 struct mm_walk *walk)
5882 { 5892 {
5883 int ret = 0; 5893 int ret = 0;
5884 struct vm_area_struct *vma = walk->private; 5894 struct vm_area_struct *vma = walk->private;
5885 pte_t *pte; 5895 pte_t *pte;
5886 spinlock_t *ptl; 5896 spinlock_t *ptl;
5887 enum mc_target_type target_type; 5897 enum mc_target_type target_type;
5888 union mc_target target; 5898 union mc_target target;
5889 struct page *page; 5899 struct page *page;
5890 struct page_cgroup *pc; 5900 struct page_cgroup *pc;
5891 5901
5892 /* 5902 /*
5893 * We don't take compound_lock() here but no race with splitting thp 5903 * We don't take compound_lock() here but no race with splitting thp
5894 * happens because: 5904 * happens because:
5895 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 5905 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5896 * under splitting, which means there's no concurrent thp split, 5906 * under splitting, which means there's no concurrent thp split,
5897 * - if another thread runs into split_huge_page() just after we 5907 * - if another thread runs into split_huge_page() just after we
5898 * entered this if-block, the thread must wait for page table lock 5908 * entered this if-block, the thread must wait for page table lock
5899 * to be unlocked in __split_huge_page_splitting(), where the main 5909 * to be unlocked in __split_huge_page_splitting(), where the main
5900 * part of thp split is not executed yet. 5910 * part of thp split is not executed yet.
5901 */ 5911 */
5902 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 5912 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5903 if (mc.precharge < HPAGE_PMD_NR) { 5913 if (mc.precharge < HPAGE_PMD_NR) {
5904 spin_unlock(ptl); 5914 spin_unlock(ptl);
5905 return 0; 5915 return 0;
5906 } 5916 }
5907 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 5917 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5908 if (target_type == MC_TARGET_PAGE) { 5918 if (target_type == MC_TARGET_PAGE) {
5909 page = target.page; 5919 page = target.page;
5910 if (!isolate_lru_page(page)) { 5920 if (!isolate_lru_page(page)) {
5911 pc = lookup_page_cgroup(page); 5921 pc = lookup_page_cgroup(page);
5912 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5922 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5913 pc, mc.from, mc.to)) { 5923 pc, mc.from, mc.to)) {
5914 mc.precharge -= HPAGE_PMD_NR; 5924 mc.precharge -= HPAGE_PMD_NR;
5915 mc.moved_charge += HPAGE_PMD_NR; 5925 mc.moved_charge += HPAGE_PMD_NR;
5916 } 5926 }
5917 putback_lru_page(page); 5927 putback_lru_page(page);
5918 } 5928 }
5919 put_page(page); 5929 put_page(page);
5920 } 5930 }
5921 spin_unlock(ptl); 5931 spin_unlock(ptl);
5922 return 0; 5932 return 0;
5923 } 5933 }
5924 5934
5925 if (pmd_trans_unstable(pmd)) 5935 if (pmd_trans_unstable(pmd))
5926 return 0; 5936 return 0;
5927 retry: 5937 retry:
5928 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5938 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5929 for (; addr != end; addr += PAGE_SIZE) { 5939 for (; addr != end; addr += PAGE_SIZE) {
5930 pte_t ptent = *(pte++); 5940 pte_t ptent = *(pte++);
5931 swp_entry_t ent; 5941 swp_entry_t ent;
5932 5942
5933 if (!mc.precharge) 5943 if (!mc.precharge)
5934 break; 5944 break;
5935 5945
5936 switch (get_mctgt_type(vma, addr, ptent, &target)) { 5946 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5937 case MC_TARGET_PAGE: 5947 case MC_TARGET_PAGE:
5938 page = target.page; 5948 page = target.page;
5939 if (isolate_lru_page(page)) 5949 if (isolate_lru_page(page))
5940 goto put; 5950 goto put;
5941 pc = lookup_page_cgroup(page); 5951 pc = lookup_page_cgroup(page);
5942 if (!mem_cgroup_move_account(page, 1, pc, 5952 if (!mem_cgroup_move_account(page, 1, pc,
5943 mc.from, mc.to)) { 5953 mc.from, mc.to)) {
5944 mc.precharge--; 5954 mc.precharge--;
5945 /* we uncharge from mc.from later. */ 5955 /* we uncharge from mc.from later. */
5946 mc.moved_charge++; 5956 mc.moved_charge++;
5947 } 5957 }
5948 putback_lru_page(page); 5958 putback_lru_page(page);
5949 put: /* get_mctgt_type() gets the page */ 5959 put: /* get_mctgt_type() gets the page */
5950 put_page(page); 5960 put_page(page);
5951 break; 5961 break;
5952 case MC_TARGET_SWAP: 5962 case MC_TARGET_SWAP:
5953 ent = target.ent; 5963 ent = target.ent;
5954 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 5964 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5955 mc.precharge--; 5965 mc.precharge--;
5956 /* we fixup refcnts and charges later. */ 5966 /* we fixup refcnts and charges later. */
5957 mc.moved_swap++; 5967 mc.moved_swap++;
5958 } 5968 }
5959 break; 5969 break;
5960 default: 5970 default:
5961 break; 5971 break;
5962 } 5972 }
5963 } 5973 }
5964 pte_unmap_unlock(pte - 1, ptl); 5974 pte_unmap_unlock(pte - 1, ptl);
5965 cond_resched(); 5975 cond_resched();
5966 5976
5967 if (addr != end) { 5977 if (addr != end) {
5968 /* 5978 /*
5969 * We have consumed all precharges we got in can_attach(). 5979 * We have consumed all precharges we got in can_attach().
5970 * We try charging one by one, but don't do any additional 5980 * We try charging one by one, but don't do any additional
5971 * charges to mc.to if charging already failed once in the attach() 5981 * charges to mc.to if charging already failed once in the attach()
5972 * phase. 5982 * phase.
5973 */ 5983 */
5974 ret = mem_cgroup_do_precharge(1); 5984 ret = mem_cgroup_do_precharge(1);
5975 if (!ret) 5985 if (!ret)
5976 goto retry; 5986 goto retry;
5977 } 5987 }
5978 5988
5979 return ret; 5989 return ret;
5980 } 5990 }
5981 5991
5982 static void mem_cgroup_move_charge(struct mm_struct *mm) 5992 static void mem_cgroup_move_charge(struct mm_struct *mm)
5983 { 5993 {
5984 struct vm_area_struct *vma; 5994 struct vm_area_struct *vma;
5985 5995
5986 lru_add_drain_all(); 5996 lru_add_drain_all();
5987 retry: 5997 retry:
5988 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5998 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5989 /* 5999 /*
5990 * Someone who is holding the mmap_sem might be waiting in 6000 * Someone who is holding the mmap_sem might be waiting in
5991 * waitq. So we cancel all extra charges, wake up all waiters, 6001 * waitq. So we cancel all extra charges, wake up all waiters,
5992 * and retry. Because we cancel precharges, we might not be able 6002 * and retry. Because we cancel precharges, we might not be able
5993 * to move enough charges, but moving charge is a best-effort 6003 * to move enough charges, but moving charge is a best-effort
5994 * feature anyway, so it wouldn't be a big problem. 6004 * feature anyway, so it wouldn't be a big problem.
5995 */ 6005 */
5996 __mem_cgroup_clear_mc(); 6006 __mem_cgroup_clear_mc();
5997 cond_resched(); 6007 cond_resched();
5998 goto retry; 6008 goto retry;
5999 } 6009 }
6000 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6010 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6001 int ret; 6011 int ret;
6002 struct mm_walk mem_cgroup_move_charge_walk = { 6012 struct mm_walk mem_cgroup_move_charge_walk = {
6003 .pmd_entry = mem_cgroup_move_charge_pte_range, 6013 .pmd_entry = mem_cgroup_move_charge_pte_range,
6004 .mm = mm, 6014 .mm = mm,
6005 .private = vma, 6015 .private = vma,
6006 }; 6016 };
6007 if (is_vm_hugetlb_page(vma)) 6017 if (is_vm_hugetlb_page(vma))
6008 continue; 6018 continue;
6009 ret = walk_page_range(vma->vm_start, vma->vm_end, 6019 ret = walk_page_range(vma->vm_start, vma->vm_end,
6010 &mem_cgroup_move_charge_walk); 6020 &mem_cgroup_move_charge_walk);
6011 if (ret) 6021 if (ret)
6012 /* 6022 /*
6013 * means we have consumed all precharges and failed in 6023 * means we have consumed all precharges and failed in
6014 * doing additional charge. Just abandon here. 6024 * doing additional charge. Just abandon here.
6015 */ 6025 */
6016 break; 6026 break;
6017 } 6027 }
6018 up_read(&mm->mmap_sem); 6028 up_read(&mm->mmap_sem);
6019 } 6029 }
6020 6030
6021 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6031 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6022 struct cgroup_taskset *tset) 6032 struct cgroup_taskset *tset)
6023 { 6033 {
6024 struct task_struct *p = cgroup_taskset_first(tset); 6034 struct task_struct *p = cgroup_taskset_first(tset);
6025 struct mm_struct *mm = get_task_mm(p); 6035 struct mm_struct *mm = get_task_mm(p);
6026 6036
6027 if (mm) { 6037 if (mm) {
6028 if (mc.to) 6038 if (mc.to)
6029 mem_cgroup_move_charge(mm); 6039 mem_cgroup_move_charge(mm);
6030 mmput(mm); 6040 mmput(mm);
6031 } 6041 }
6032 if (mc.to) 6042 if (mc.to)
6033 mem_cgroup_clear_mc(); 6043 mem_cgroup_clear_mc();
6034 } 6044 }
6035 #else /* !CONFIG_MMU */ 6045 #else /* !CONFIG_MMU */
6036 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6046 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6037 struct cgroup_taskset *tset) 6047 struct cgroup_taskset *tset)
6038 { 6048 {
6039 return 0; 6049 return 0;
6040 } 6050 }
6041 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6051 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6042 struct cgroup_taskset *tset) 6052 struct cgroup_taskset *tset)
6043 { 6053 {
6044 } 6054 }
6045 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 6055 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
6046 struct cgroup_taskset *tset) 6056 struct cgroup_taskset *tset)
6047 { 6057 {
6048 } 6058 }
6049 #endif 6059 #endif
6050 6060
6051 /* 6061 /*
6052 * Cgroup retains root cgroups across [un]mount cycles making it necessary 6062 * Cgroup retains root cgroups across [un]mount cycles making it necessary
6053 * to verify whether we're attached to the default hierarchy on each mount 6063 * to verify whether we're attached to the default hierarchy on each mount
6054 * attempt. 6064 * attempt.
6055 */ 6065 */
6056 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 6066 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
6057 { 6067 {
6058 /* 6068 /*
6059 * use_hierarchy is forced on the default hierarchy. cgroup core 6069 * use_hierarchy is forced on the default hierarchy. cgroup core
6060 * guarantees that @root doesn't have any children, so turning it 6070 * guarantees that @root doesn't have any children, so turning it
6061 * on for the root memcg is enough. 6071 * on for the root memcg is enough.
6062 */ 6072 */
6063 if (cgroup_on_dfl(root_css->cgroup)) 6073 if (cgroup_on_dfl(root_css->cgroup))
6064 mem_cgroup_from_css(root_css)->use_hierarchy = true; 6074 mem_cgroup_from_css(root_css)->use_hierarchy = true;
6065 } 6075 }
6066 6076
6067 struct cgroup_subsys memory_cgrp_subsys = { 6077 struct cgroup_subsys memory_cgrp_subsys = {
6068 .css_alloc = mem_cgroup_css_alloc, 6078 .css_alloc = mem_cgroup_css_alloc,
6069 .css_online = mem_cgroup_css_online, 6079 .css_online = mem_cgroup_css_online,
6070 .css_offline = mem_cgroup_css_offline, 6080 .css_offline = mem_cgroup_css_offline,
6071 .css_free = mem_cgroup_css_free, 6081 .css_free = mem_cgroup_css_free,
6072 .css_reset = mem_cgroup_css_reset, 6082 .css_reset = mem_cgroup_css_reset,
6073 .can_attach = mem_cgroup_can_attach, 6083 .can_attach = mem_cgroup_can_attach,
6074 .cancel_attach = mem_cgroup_cancel_attach, 6084 .cancel_attach = mem_cgroup_cancel_attach,
6075 .attach = mem_cgroup_move_task, 6085 .attach = mem_cgroup_move_task,
6076 .bind = mem_cgroup_bind, 6086 .bind = mem_cgroup_bind,
6077 .legacy_cftypes = mem_cgroup_files, 6087 .legacy_cftypes = mem_cgroup_files,
6078 .early_init = 0, 6088 .early_init = 0,
6079 }; 6089 };
6080 6090
6081 #ifdef CONFIG_MEMCG_SWAP 6091 #ifdef CONFIG_MEMCG_SWAP
6082 static int __init enable_swap_account(char *s) 6092 static int __init enable_swap_account(char *s)
6083 { 6093 {
6084 if (!strcmp(s, "1")) 6094 if (!strcmp(s, "1"))
6085 really_do_swap_account = 1; 6095 really_do_swap_account = 1;
6086 else if (!strcmp(s, "0")) 6096 else if (!strcmp(s, "0"))
6087 really_do_swap_account = 0; 6097 really_do_swap_account = 0;
6088 return 1; 6098 return 1;
6089 } 6099 }
6090 __setup("swapaccount=", enable_swap_account); 6100 __setup("swapaccount=", enable_swap_account);
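enable_swap_cgroup() below only turns on do_swap_account when really_do_swap_account is set, so the parameter registered here lets the boot command line override the build-time default on a CONFIG_MEMCG_SWAP kernel, e.g.:

	swapaccount=0

keeps memsw accounting off (memsw_file_init() is never called, so the memsw control files are not registered), while swapaccount=1 forces it on.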
6091 6101
6092 static void __init memsw_file_init(void) 6102 static void __init memsw_file_init(void)
6093 { 6103 {
6094 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, 6104 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6095 memsw_cgroup_files)); 6105 memsw_cgroup_files));
6096 } 6106 }
6097 6107
6098 static void __init enable_swap_cgroup(void) 6108 static void __init enable_swap_cgroup(void)
6099 { 6109 {
6100 if (!mem_cgroup_disabled() && really_do_swap_account) { 6110 if (!mem_cgroup_disabled() && really_do_swap_account) {
6101 do_swap_account = 1; 6111 do_swap_account = 1;
6102 memsw_file_init(); 6112 memsw_file_init();
6103 } 6113 }
6104 } 6114 }
6105 6115
6106 #else 6116 #else
6107 static void __init enable_swap_cgroup(void) 6117 static void __init enable_swap_cgroup(void)
6108 { 6118 {
6109 } 6119 }
6110 #endif 6120 #endif
6111 6121
6112 #ifdef CONFIG_MEMCG_SWAP 6122 #ifdef CONFIG_MEMCG_SWAP
6113 /** 6123 /**
6114 * mem_cgroup_swapout - transfer a memsw charge to swap 6124 * mem_cgroup_swapout - transfer a memsw charge to swap
6115 * @page: page whose memsw charge to transfer 6125 * @page: page whose memsw charge to transfer
6116 * @entry: swap entry to move the charge to 6126 * @entry: swap entry to move the charge to
6117 * 6127 *
6118 * Transfer the memsw charge of @page to @entry. 6128 * Transfer the memsw charge of @page to @entry.
6119 */ 6129 */
6120 void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 6130 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
6121 { 6131 {
6122 struct page_cgroup *pc; 6132 struct page_cgroup *pc;
6123 unsigned short oldid; 6133 unsigned short oldid;
6124 6134
6125 VM_BUG_ON_PAGE(PageLRU(page), page); 6135 VM_BUG_ON_PAGE(PageLRU(page), page);
6126 VM_BUG_ON_PAGE(page_count(page), page); 6136 VM_BUG_ON_PAGE(page_count(page), page);
6127 6137
6128 if (!do_swap_account) 6138 if (!do_swap_account)
6129 return; 6139 return;
6130 6140
6131 pc = lookup_page_cgroup(page); 6141 pc = lookup_page_cgroup(page);
6132 6142
6133 /* Readahead page, never charged */ 6143 /* Readahead page, never charged */
6134 if (!PageCgroupUsed(pc)) 6144 if (!PageCgroupUsed(pc))
6135 return; 6145 return;
6136 6146
6137 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); 6147 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
6138 6148
6139 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); 6149 oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup));
6140 VM_BUG_ON_PAGE(oldid, page); 6150 VM_BUG_ON_PAGE(oldid, page);
6141 6151
6142 pc->flags &= ~PCG_MEMSW; 6152 pc->flags &= ~PCG_MEMSW;
6143 css_get(&pc->mem_cgroup->css); 6153 css_get(&pc->mem_cgroup->css);
6144 mem_cgroup_swap_statistics(pc->mem_cgroup, true); 6154 mem_cgroup_swap_statistics(pc->mem_cgroup, true);
6145 } 6155 }
6146 6156
6147 /** 6157 /**
6148 * mem_cgroup_uncharge_swap - uncharge a swap entry 6158 * mem_cgroup_uncharge_swap - uncharge a swap entry
6149 * @entry: swap entry to uncharge 6159 * @entry: swap entry to uncharge
6150 * 6160 *
6151 * Drop the memsw charge associated with @entry. 6161 * Drop the memsw charge associated with @entry.
6152 */ 6162 */
6153 void mem_cgroup_uncharge_swap(swp_entry_t entry) 6163 void mem_cgroup_uncharge_swap(swp_entry_t entry)
6154 { 6164 {
6155 struct mem_cgroup *memcg; 6165 struct mem_cgroup *memcg;
6156 unsigned short id; 6166 unsigned short id;
6157 6167
6158 if (!do_swap_account) 6168 if (!do_swap_account)
6159 return; 6169 return;
6160 6170
6161 id = swap_cgroup_record(entry, 0); 6171 id = swap_cgroup_record(entry, 0);
6162 rcu_read_lock(); 6172 rcu_read_lock();
6163 memcg = mem_cgroup_lookup(id); 6173 memcg = mem_cgroup_lookup(id);
6164 if (memcg) { 6174 if (memcg) {
6165 if (!mem_cgroup_is_root(memcg)) 6175 if (!mem_cgroup_is_root(memcg))
6166 page_counter_uncharge(&memcg->memsw, 1); 6176 page_counter_uncharge(&memcg->memsw, 1);
6167 mem_cgroup_swap_statistics(memcg, false); 6177 mem_cgroup_swap_statistics(memcg, false);
6168 css_put(&memcg->css); 6178 css_put(&memcg->css);
6169 } 6179 }
6170 rcu_read_unlock(); 6180 rcu_read_unlock();
6171 } 6181 }
6172 #endif 6182 #endif
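Inside this #ifdef block the swap record's pin on the memcg is explicit: mem_cgroup_swapout() takes a css reference when the charge is transferred to the swap entry, and mem_cgroup_uncharge_swap() drops it when the entry is released. A minimal sketch of that pairing over a page's swap round-trip (the surrounding swap code is not part of this hunk):

	/* swap-out path: the charge follows the page into the swap entry */
	mem_cgroup_swapout(page, entry);	/* records the memcg id, css_get() */

	/* later, when the swap entry is freed */
	mem_cgroup_uncharge_swap(entry);	/* uncharges memsw, css_put() */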
6173 6183
6174 /** 6184 /**
6175 * mem_cgroup_try_charge - try charging a page 6185 * mem_cgroup_try_charge - try charging a page
6176 * @page: page to charge 6186 * @page: page to charge
6177 * @mm: mm context of the victim 6187 * @mm: mm context of the victim
6178 * @gfp_mask: reclaim mode 6188 * @gfp_mask: reclaim mode
6179 * @memcgp: charged memcg return 6189 * @memcgp: charged memcg return
6180 * 6190 *
6181 * Try to charge @page to the memcg that @mm belongs to, reclaiming 6191 * Try to charge @page to the memcg that @mm belongs to, reclaiming
6182 * pages according to @gfp_mask if necessary. 6192 * pages according to @gfp_mask if necessary.
6183 * 6193 *
6184 * Returns 0 on success, with *@memcgp pointing to the charged memcg. 6194 * Returns 0 on success, with *@memcgp pointing to the charged memcg.
6185 * Otherwise, an error code is returned. 6195 * Otherwise, an error code is returned.
6186 * 6196 *
6187 * After page->mapping has been set up, the caller must finalize the 6197 * After page->mapping has been set up, the caller must finalize the
6188 * charge with mem_cgroup_commit_charge(), or abort the transaction 6198 * charge with mem_cgroup_commit_charge(), or abort the transaction
6189 * with mem_cgroup_cancel_charge() in case page instantiation fails. 6199 * with mem_cgroup_cancel_charge() in case page instantiation fails.
6190 */ 6200 */
6191 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, 6201 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
6192 gfp_t gfp_mask, struct mem_cgroup **memcgp) 6202 gfp_t gfp_mask, struct mem_cgroup **memcgp)
6193 { 6203 {
6194 struct mem_cgroup *memcg = NULL; 6204 struct mem_cgroup *memcg = NULL;
6195 unsigned int nr_pages = 1; 6205 unsigned int nr_pages = 1;
6196 int ret = 0; 6206 int ret = 0;
6197 6207
6198 if (mem_cgroup_disabled()) 6208 if (mem_cgroup_disabled())
6199 goto out; 6209 goto out;
6200 6210
6201 if (PageSwapCache(page)) { 6211 if (PageSwapCache(page)) {
6202 struct page_cgroup *pc = lookup_page_cgroup(page); 6212 struct page_cgroup *pc = lookup_page_cgroup(page);
6203 /* 6213 /*
6204 * Every swap fault against a single page tries to charge the 6214 * Every swap fault against a single page tries to charge the
6205 * page, bail as early as possible. shmem_unuse() encounters 6215 * page, bail as early as possible. shmem_unuse() encounters
6206 * already charged pages, too. The USED bit is protected by 6216 * already charged pages, too. The USED bit is protected by
6207 * the page lock, which serializes swap cache removal, which 6217 * the page lock, which serializes swap cache removal, which
6208 * in turn serializes uncharging. 6218 * in turn serializes uncharging.
6209 */ 6219 */
6210 if (PageCgroupUsed(pc)) 6220 if (PageCgroupUsed(pc))
6211 goto out; 6221 goto out;
6212 } 6222 }
6213 6223
6214 if (PageTransHuge(page)) { 6224 if (PageTransHuge(page)) {
6215 nr_pages <<= compound_order(page); 6225 nr_pages <<= compound_order(page);
6216 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6226 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6217 } 6227 }
6218 6228
6219 if (do_swap_account && PageSwapCache(page)) 6229 if (do_swap_account && PageSwapCache(page))
6220 memcg = try_get_mem_cgroup_from_page(page); 6230 memcg = try_get_mem_cgroup_from_page(page);
6221 if (!memcg) 6231 if (!memcg)
6222 memcg = get_mem_cgroup_from_mm(mm); 6232 memcg = get_mem_cgroup_from_mm(mm);
6223 6233
6224 ret = try_charge(memcg, gfp_mask, nr_pages); 6234 ret = try_charge(memcg, gfp_mask, nr_pages);
6225 6235
6226 css_put(&memcg->css); 6236 css_put(&memcg->css);
6227 6237
6228 if (ret == -EINTR) { 6238 if (ret == -EINTR) {
6229 memcg = root_mem_cgroup; 6239 memcg = root_mem_cgroup;
6230 ret = 0; 6240 ret = 0;
6231 } 6241 }
6232 out: 6242 out:
6233 *memcgp = memcg; 6243 *memcgp = memcg;
6234 return ret; 6244 return ret;
6235 } 6245 }
6236 6246
6237 /** 6247 /**
6238 * mem_cgroup_commit_charge - commit a page charge 6248 * mem_cgroup_commit_charge - commit a page charge
6239 * @page: page to charge 6249 * @page: page to charge
6240 * @memcg: memcg to charge the page to 6250 * @memcg: memcg to charge the page to
6241 * @lrucare: page might be on LRU already 6251 * @lrucare: page might be on LRU already
6242 * 6252 *
6243 * Finalize a charge transaction started by mem_cgroup_try_charge(), 6253 * Finalize a charge transaction started by mem_cgroup_try_charge(),
6244 * after page->mapping has been set up. This must happen atomically 6254 * after page->mapping has been set up. This must happen atomically
6245 * as part of the page instantiation, i.e. under the page table lock 6255 * as part of the page instantiation, i.e. under the page table lock
6246 * for anonymous pages, under the page lock for page and swap cache. 6256 * for anonymous pages, under the page lock for page and swap cache.
6247 * 6257 *
6248 * In addition, the page must not be on the LRU during the commit, to 6258 * In addition, the page must not be on the LRU during the commit, to
6249 * prevent racing with task migration. If it might be, use @lrucare. 6259 * prevent racing with task migration. If it might be, use @lrucare.
6250 * 6260 *
6251 * Use mem_cgroup_cancel_charge() to cancel the transaction instead. 6261 * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
6252 */ 6262 */
6253 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, 6263 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
6254 bool lrucare) 6264 bool lrucare)
6255 { 6265 {
6256 unsigned int nr_pages = 1; 6266 unsigned int nr_pages = 1;
6257 6267
6258 VM_BUG_ON_PAGE(!page->mapping, page); 6268 VM_BUG_ON_PAGE(!page->mapping, page);
6259 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); 6269 VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
6260 6270
6261 if (mem_cgroup_disabled()) 6271 if (mem_cgroup_disabled())
6262 return; 6272 return;
6263 /* 6273 /*
6264 * Swap faults will attempt to charge the same page multiple 6274 * Swap faults will attempt to charge the same page multiple
6265 * times. But reuse_swap_page() might have removed the page 6275 * times. But reuse_swap_page() might have removed the page
6266 * from swapcache already, so we can't check PageSwapCache(). 6276 * from swapcache already, so we can't check PageSwapCache().
6267 */ 6277 */
6268 if (!memcg) 6278 if (!memcg)
6269 return; 6279 return;
6270 6280
6271 commit_charge(page, memcg, lrucare); 6281 commit_charge(page, memcg, lrucare);
6272 6282
6273 if (PageTransHuge(page)) { 6283 if (PageTransHuge(page)) {
6274 nr_pages <<= compound_order(page); 6284 nr_pages <<= compound_order(page);
6275 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6285 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6276 } 6286 }
6277 6287
6278 local_irq_disable(); 6288 local_irq_disable();
6279 mem_cgroup_charge_statistics(memcg, page, nr_pages); 6289 mem_cgroup_charge_statistics(memcg, page, nr_pages);
6280 memcg_check_events(memcg, page); 6290 memcg_check_events(memcg, page);
6281 local_irq_enable(); 6291 local_irq_enable();
6282 6292
6283 if (do_swap_account && PageSwapCache(page)) { 6293 if (do_swap_account && PageSwapCache(page)) {
6284 swp_entry_t entry = { .val = page_private(page) }; 6294 swp_entry_t entry = { .val = page_private(page) };
6285 /* 6295 /*
6286 * The swap entry might not get freed for a long time, 6296 * The swap entry might not get freed for a long time,
6287 * let's not wait for it. The page already received a 6297 * let's not wait for it. The page already received a
6288 * memory+swap charge, drop the swap entry duplicate. 6298 * memory+swap charge, drop the swap entry duplicate.
6289 */ 6299 */
6290 mem_cgroup_uncharge_swap(entry); 6300 mem_cgroup_uncharge_swap(entry);
6291 } 6301 }
6292 } 6302 }
6293 6303
6294 /** 6304 /**
6295 * mem_cgroup_cancel_charge - cancel a page charge 6305 * mem_cgroup_cancel_charge - cancel a page charge
6296 * @page: page to charge 6306 * @page: page to charge
6297 * @memcg: memcg to charge the page to 6307 * @memcg: memcg to charge the page to
6298 * 6308 *
6299 * Cancel a charge transaction started by mem_cgroup_try_charge(). 6309 * Cancel a charge transaction started by mem_cgroup_try_charge().
6300 */ 6310 */
6301 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) 6311 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
6302 { 6312 {
6303 unsigned int nr_pages = 1; 6313 unsigned int nr_pages = 1;
6304 6314
6305 if (mem_cgroup_disabled()) 6315 if (mem_cgroup_disabled())
6306 return; 6316 return;
6307 /* 6317 /*
6308 * Swap faults will attempt to charge the same page multiple 6318 * Swap faults will attempt to charge the same page multiple
6309 * times. But reuse_swap_page() might have removed the page 6319 * times. But reuse_swap_page() might have removed the page
6310 * from swapcache already, so we can't check PageSwapCache(). 6320 * from swapcache already, so we can't check PageSwapCache().
6311 */ 6321 */
6312 if (!memcg) 6322 if (!memcg)
6313 return; 6323 return;
6314 6324
6315 if (PageTransHuge(page)) { 6325 if (PageTransHuge(page)) {
6316 nr_pages <<= compound_order(page); 6326 nr_pages <<= compound_order(page);
6317 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6327 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6318 } 6328 }
6319 6329
6320 cancel_charge(memcg, nr_pages); 6330 cancel_charge(memcg, nr_pages);
6321 } 6331 }
6322 6332
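For illustration, a minimal sketch (not from this patch) of how a caller is expected to drive this three-step transaction, loosely modeled on the anonymous fault path. map_new_anon_page() is a hypothetical stand-in for the rmap and page-table setup; the real fault path performs that setup, and commits the charge, while still holding the page table lock.

#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/swap.h>

/* Hypothetical helper: install @page into @vma at @addr (rmap + page tables). */
static int map_new_anon_page(struct page *page, struct vm_area_struct *vma,
			     unsigned long addr);

static int charge_new_anon_page(struct page *page, struct vm_area_struct *vma,
				unsigned long addr)
{
	struct mem_cgroup *memcg;

	/* Reserve the charge, reclaiming according to the gfp mask if needed. */
	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg))
		return -ENOMEM;

	if (map_new_anon_page(page, vma, addr)) {
		/* Instantiation failed: abort the pending charge. */
		mem_cgroup_cancel_charge(page, memcg);
		return -ENOMEM;
	}

	/*
	 * page->mapping is now set up and the page is not yet on the
	 * LRU, so the charge can be committed with lrucare == false.
	 */
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}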
6323 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, 6333 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
6324 unsigned long nr_mem, unsigned long nr_memsw, 6334 unsigned long nr_mem, unsigned long nr_memsw,
6325 unsigned long nr_anon, unsigned long nr_file, 6335 unsigned long nr_anon, unsigned long nr_file,
6326 unsigned long nr_huge, struct page *dummy_page) 6336 unsigned long nr_huge, struct page *dummy_page)
6327 { 6337 {
6328 unsigned long flags; 6338 unsigned long flags;
6329 6339
6330 if (!mem_cgroup_is_root(memcg)) { 6340 if (!mem_cgroup_is_root(memcg)) {
6331 if (nr_mem) 6341 if (nr_mem)
6332 page_counter_uncharge(&memcg->memory, nr_mem); 6342 page_counter_uncharge(&memcg->memory, nr_mem);
6333 if (nr_memsw) 6343 if (nr_memsw)
6334 page_counter_uncharge(&memcg->memsw, nr_memsw); 6344 page_counter_uncharge(&memcg->memsw, nr_memsw);
6335 memcg_oom_recover(memcg); 6345 memcg_oom_recover(memcg);
6336 } 6346 }
6337 6347
6338 local_irq_save(flags); 6348 local_irq_save(flags);
6339 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); 6349 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
6340 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); 6350 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
6341 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); 6351 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
6342 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); 6352 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
6343 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); 6353 __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
6344 memcg_check_events(memcg, dummy_page); 6354 memcg_check_events(memcg, dummy_page);
6345 local_irq_restore(flags); 6355 local_irq_restore(flags);
6356
6357 if (!mem_cgroup_is_root(memcg))
6358 css_put_many(&memcg->css, max(nr_mem, nr_memsw));
6346 } 6359 }
6347 6360
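The css_put_many() added to the batch above drops the references that, with this patch, every charged page now holds on its memcg's css. A simplified sketch of that pairing (illustrative only; the real charge and uncharge paths batch the gets and puts, as seen here), assuming the css_get_many()/css_put_many() helpers from include/linux/cgroup.h:

#include <linux/cgroup.h>

/* One css reference is taken per page when a charge succeeds ... */
static void charge_pins_css(struct cgroup_subsys_state *css, unsigned int nr_pages)
{
	css_get_many(css, nr_pages);
}

/* ... and dropped again when the pages are uncharged. */
static void uncharge_unpins_css(struct cgroup_subsys_state *css, unsigned int nr_pages)
{
	css_put_many(css, nr_pages);
}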
6348 static void uncharge_list(struct list_head *page_list) 6361 static void uncharge_list(struct list_head *page_list)
6349 { 6362 {
6350 struct mem_cgroup *memcg = NULL; 6363 struct mem_cgroup *memcg = NULL;
6351 unsigned long nr_memsw = 0; 6364 unsigned long nr_memsw = 0;
6352 unsigned long nr_anon = 0; 6365 unsigned long nr_anon = 0;
6353 unsigned long nr_file = 0; 6366 unsigned long nr_file = 0;
6354 unsigned long nr_huge = 0; 6367 unsigned long nr_huge = 0;
6355 unsigned long pgpgout = 0; 6368 unsigned long pgpgout = 0;
6356 unsigned long nr_mem = 0; 6369 unsigned long nr_mem = 0;
6357 struct list_head *next; 6370 struct list_head *next;
6358 struct page *page; 6371 struct page *page;
6359 6372
6360 next = page_list->next; 6373 next = page_list->next;
6361 do { 6374 do {
6362 unsigned int nr_pages = 1; 6375 unsigned int nr_pages = 1;
6363 struct page_cgroup *pc; 6376 struct page_cgroup *pc;
6364 6377
6365 page = list_entry(next, struct page, lru); 6378 page = list_entry(next, struct page, lru);
6366 next = page->lru.next; 6379 next = page->lru.next;
6367 6380
6368 VM_BUG_ON_PAGE(PageLRU(page), page); 6381 VM_BUG_ON_PAGE(PageLRU(page), page);
6369 VM_BUG_ON_PAGE(page_count(page), page); 6382 VM_BUG_ON_PAGE(page_count(page), page);
6370 6383
6371 pc = lookup_page_cgroup(page); 6384 pc = lookup_page_cgroup(page);
6372 if (!PageCgroupUsed(pc)) 6385 if (!PageCgroupUsed(pc))
6373 continue; 6386 continue;
6374 6387
6375 /* 6388 /*
6376 * Nobody should be changing or seriously looking at 6389 * Nobody should be changing or seriously looking at
6377 * pc->mem_cgroup and pc->flags at this point, we have 6390 * pc->mem_cgroup and pc->flags at this point, we have
6378 * fully exclusive access to the page. 6391 * fully exclusive access to the page.
6379 */ 6392 */
6380 6393
6381 if (memcg != pc->mem_cgroup) { 6394 if (memcg != pc->mem_cgroup) {
6382 if (memcg) { 6395 if (memcg) {
6383 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6396 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6384 nr_anon, nr_file, nr_huge, page); 6397 nr_anon, nr_file, nr_huge, page);
6385 pgpgout = nr_mem = nr_memsw = 0; 6398 pgpgout = nr_mem = nr_memsw = 0;
6386 nr_anon = nr_file = nr_huge = 0; 6399 nr_anon = nr_file = nr_huge = 0;
6387 } 6400 }
6388 memcg = pc->mem_cgroup; 6401 memcg = pc->mem_cgroup;
6389 } 6402 }
6390 6403
6391 if (PageTransHuge(page)) { 6404 if (PageTransHuge(page)) {
6392 nr_pages <<= compound_order(page); 6405 nr_pages <<= compound_order(page);
6393 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 6406 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
6394 nr_huge += nr_pages; 6407 nr_huge += nr_pages;
6395 } 6408 }
6396 6409
6397 if (PageAnon(page)) 6410 if (PageAnon(page))
6398 nr_anon += nr_pages; 6411 nr_anon += nr_pages;
6399 else 6412 else
6400 nr_file += nr_pages; 6413 nr_file += nr_pages;
6401 6414
6402 if (pc->flags & PCG_MEM) 6415 if (pc->flags & PCG_MEM)
6403 nr_mem += nr_pages; 6416 nr_mem += nr_pages;
6404 if (pc->flags & PCG_MEMSW) 6417 if (pc->flags & PCG_MEMSW)
6405 nr_memsw += nr_pages; 6418 nr_memsw += nr_pages;
6406 pc->flags = 0; 6419 pc->flags = 0;
6407 6420
6408 pgpgout++; 6421 pgpgout++;
6409 } while (next != page_list); 6422 } while (next != page_list);
6410 6423
6411 if (memcg) 6424 if (memcg)
6412 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, 6425 uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
6413 nr_anon, nr_file, nr_huge, page); 6426 nr_anon, nr_file, nr_huge, page);
6414 } 6427 }
6415 6428
6416 /** 6429 /**
6417 * mem_cgroup_uncharge - uncharge a page 6430 * mem_cgroup_uncharge - uncharge a page
6418 * @page: page to uncharge 6431 * @page: page to uncharge
6419 * 6432 *
6420 * Uncharge a page previously charged with mem_cgroup_try_charge() and 6433 * Uncharge a page previously charged with mem_cgroup_try_charge() and
6421 * mem_cgroup_commit_charge(). 6434 * mem_cgroup_commit_charge().
6422 */ 6435 */
6423 void mem_cgroup_uncharge(struct page *page) 6436 void mem_cgroup_uncharge(struct page *page)
6424 { 6437 {
6425 struct page_cgroup *pc; 6438 struct page_cgroup *pc;
6426 6439
6427 if (mem_cgroup_disabled()) 6440 if (mem_cgroup_disabled())
6428 return; 6441 return;
6429 6442
6430 /* Don't touch page->lru of any random page, pre-check: */ 6443 /* Don't touch page->lru of any random page, pre-check: */
6431 pc = lookup_page_cgroup(page); 6444 pc = lookup_page_cgroup(page);
6432 if (!PageCgroupUsed(pc)) 6445 if (!PageCgroupUsed(pc))
6433 return; 6446 return;
6434 6447
6435 INIT_LIST_HEAD(&page->lru); 6448 INIT_LIST_HEAD(&page->lru);
6436 uncharge_list(&page->lru); 6449 uncharge_list(&page->lru);
6437 } 6450 }
6438 6451
6439 /** 6452 /**
6440 * mem_cgroup_uncharge_list - uncharge a list of pages 6453 * mem_cgroup_uncharge_list - uncharge a list of pages
6441 * @page_list: list of pages to uncharge 6454 * @page_list: list of pages to uncharge
6442 * 6455 *
6443 * Uncharge a list of pages previously charged with 6456 * Uncharge a list of pages previously charged with
6444 * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). 6457 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
6445 */ 6458 */
6446 void mem_cgroup_uncharge_list(struct list_head *page_list) 6459 void mem_cgroup_uncharge_list(struct list_head *page_list)
6447 { 6460 {
6448 if (mem_cgroup_disabled()) 6461 if (mem_cgroup_disabled())
6449 return; 6462 return;
6450 6463
6451 if (!list_empty(page_list)) 6464 if (!list_empty(page_list))
6452 uncharge_list(page_list); 6465 uncharge_list(page_list);
6453 } 6466 }
6454 6467
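For illustration, a minimal sketch (not from this patch) of a bulk free path using the list variant, loosely modeled on release_pages(). The real path also takes the lru_lock and removes each page from the LRU before queueing it; that part is elided here.

#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/mm.h>

static void drop_page_refs(struct page **pages, int nr)
{
	LIST_HEAD(pages_to_free);
	int i;

	for (i = 0; i < nr; i++) {
		struct page *page = pages[i];

		/* Only pages whose last reference just went away get freed. */
		if (!put_page_testzero(page))
			continue;

		/* Assumes the page has already been taken off the LRU. */
		list_add(&page->lru, &pages_to_free);
	}

	/* Drop the memcg charges for the whole batch in one pass ... */
	mem_cgroup_uncharge_list(&pages_to_free);
	/* ... then hand the pages back to the page allocator. */
	free_hot_cold_page_list(&pages_to_free, true);
}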
6455 /** 6468 /**
6456 * mem_cgroup_migrate - migrate a charge to another page 6469 * mem_cgroup_migrate - migrate a charge to another page
6457 * @oldpage: currently charged page 6470 * @oldpage: currently charged page
6458 * @newpage: page to transfer the charge to 6471 * @newpage: page to transfer the charge to
6459 * @lrucare: both pages might be on the LRU already 6472 * @lrucare: both pages might be on the LRU already
6460 * 6473 *
6461 * Migrate the charge from @oldpage to @newpage. 6474 * Migrate the charge from @oldpage to @newpage.
6462 * 6475 *
6463 * Both pages must be locked, @newpage->mapping must be set up. 6476 * Both pages must be locked, @newpage->mapping must be set up.
6464 */ 6477 */
6465 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, 6478 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
6466 bool lrucare) 6479 bool lrucare)
6467 { 6480 {
6468 struct page_cgroup *pc; 6481 struct page_cgroup *pc;
6469 int isolated; 6482 int isolated;
6470 6483
6471 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 6484 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6472 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); 6485 VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6473 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); 6486 VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
6474 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); 6487 VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
6475 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); 6488 VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6476 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), 6489 VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6477 newpage); 6490 newpage);
6478 6491
6479 if (mem_cgroup_disabled()) 6492 if (mem_cgroup_disabled())
6480 return; 6493 return;
6481 6494
6482 /* Page cache replacement: new page already charged? */ 6495 /* Page cache replacement: new page already charged? */
6483 pc = lookup_page_cgroup(newpage); 6496 pc = lookup_page_cgroup(newpage);
6484 if (PageCgroupUsed(pc)) 6497 if (PageCgroupUsed(pc))
6485 return; 6498 return;
6486 6499
6487 /* Re-entrant migration: old page already uncharged? */ 6500 /* Re-entrant migration: old page already uncharged? */
6488 pc = lookup_page_cgroup(oldpage); 6501 pc = lookup_page_cgroup(oldpage);
6489 if (!PageCgroupUsed(pc)) 6502 if (!PageCgroupUsed(pc))
6490 return; 6503 return;
6491 6504
6492 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); 6505 VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
6493 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); 6506 VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
6494 6507
6495 if (lrucare) 6508 if (lrucare)
6496 lock_page_lru(oldpage, &isolated); 6509 lock_page_lru(oldpage, &isolated);
6497 6510
6498 pc->flags = 0; 6511 pc->flags = 0;
6499 6512
6500 if (lrucare) 6513 if (lrucare)
6501 unlock_page_lru(oldpage, isolated); 6514 unlock_page_lru(oldpage, isolated);
6502 6515
6503 commit_charge(newpage, pc->mem_cgroup, lrucare); 6516 commit_charge(newpage, pc->mem_cgroup, lrucare);
6504 } 6517 }
6505 6518
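For illustration, a minimal sketch (not from this patch) of the lrucare case, loosely modeled on page cache replacement: both pages are locked, the new page's mapping has been set up, and either page may already be on the LRU. The radix tree update itself is elided.

#include <linux/memcontrol.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical wrapper: replace @old with @new and move the charge along. */
static void replace_cached_page(struct page *old, struct page *new)
{
	VM_BUG_ON_PAGE(!PageLocked(old), old);
	VM_BUG_ON_PAGE(!PageLocked(new), new);

	/* ... swap @old for @new in the mapping's radix tree ... */

	mem_cgroup_migrate(old, new, true);
}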
6506 /* 6519 /*
6507 * subsys_initcall() for memory controller. 6520 * subsys_initcall() for memory controller.
6508 * 6521 *
6509 * Some parts like hotcpu_notifier() have to be initialized from this context 6522 * Some parts like hotcpu_notifier() have to be initialized from this context
6510 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 6523 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
6511 * everything that doesn't depend on a specific mem_cgroup structure should 6524 * everything that doesn't depend on a specific mem_cgroup structure should
6512 * be initialized from here. 6525 * be initialized from here.
6513 */ 6526 */
6514 static int __init mem_cgroup_init(void) 6527 static int __init mem_cgroup_init(void)
6515 { 6528 {
6516 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 6529 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
6517 enable_swap_cgroup(); 6530 enable_swap_cgroup();
6518 mem_cgroup_soft_limit_tree_init(); 6531 mem_cgroup_soft_limit_tree_init();
6519 memcg_stock_init(); 6532 memcg_stock_init();
6520 return 0; 6533 return 0;
6521 } 6534 }