Commit e8ea14cc6eadfe2ea63e9989e16e62625a2619f8
Committed by Linus Torvalds
1 parent 5ac8fb31ad
Exists in ti-lsk-linux-4.1.y and in 10 other branches
mm: memcontrol: take a css reference for each charged page
Charges currently pin the css indirectly by playing tricks during css_offline(): user pages stall the offlining process until all of them have been reparented, whereas kmemcg acquires a keep-alive reference if outstanding kernel pages are detected at that point.

In preparation for removing all this complexity, make the pinning explicit and acquire a css reference for every charged page.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
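Only the include/linux/cgroup.h hunks are shown below; the memcontrol call sites live in the other two changed files, which this excerpt does not include. As a minimal sketch of the intended pattern (the helper names sketch_charge/sketch_uncharge are hypothetical; only css_get_many()/css_put_many() come from this commit), a charge path would take one css reference per page and the uncharge path would drop them again:

	/* Sketch only: each charged page now pins the css explicitly. */
	static void sketch_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* take one css reference per page being charged */
		css_get_many(&memcg->css, nr_pages);
	}

	static void sketch_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
	{
		/* drop the references taken at charge time */
		css_put_many(&memcg->css, nr_pages);
	}

With the references made explicit like this, css_offline() no longer has to stall offlining for user pages or take a keep-alive reference for outstanding kernel pages.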
Showing 3 changed files with 81 additions and 13 deletions
include/linux/cgroup.h
1 | #ifndef _LINUX_CGROUP_H | 1 | #ifndef _LINUX_CGROUP_H |
2 | #define _LINUX_CGROUP_H | 2 | #define _LINUX_CGROUP_H |
3 | /* | 3 | /* |
4 | * cgroup interface | 4 | * cgroup interface |
5 | * | 5 | * |
6 | * Copyright (C) 2003 BULL SA | 6 | * Copyright (C) 2003 BULL SA |
7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. | 7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. |
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
12 | #include <linux/cpumask.h> | 12 | #include <linux/cpumask.h> |
13 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
14 | #include <linux/rcupdate.h> | 14 | #include <linux/rcupdate.h> |
15 | #include <linux/rculist.h> | 15 | #include <linux/rculist.h> |
16 | #include <linux/cgroupstats.h> | 16 | #include <linux/cgroupstats.h> |
17 | #include <linux/rwsem.h> | 17 | #include <linux/rwsem.h> |
18 | #include <linux/idr.h> | 18 | #include <linux/idr.h> |
19 | #include <linux/workqueue.h> | 19 | #include <linux/workqueue.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/percpu-refcount.h> | 21 | #include <linux/percpu-refcount.h> |
22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
23 | #include <linux/kernfs.h> | 23 | #include <linux/kernfs.h> |
24 | #include <linux/wait.h> | 24 | #include <linux/wait.h> |
25 | 25 | ||
26 | #ifdef CONFIG_CGROUPS | 26 | #ifdef CONFIG_CGROUPS |
27 | 27 | ||
28 | struct cgroup_root; | 28 | struct cgroup_root; |
29 | struct cgroup_subsys; | 29 | struct cgroup_subsys; |
30 | struct cgroup; | 30 | struct cgroup; |
31 | 31 | ||
32 | extern int cgroup_init_early(void); | 32 | extern int cgroup_init_early(void); |
33 | extern int cgroup_init(void); | 33 | extern int cgroup_init(void); |
34 | extern void cgroup_fork(struct task_struct *p); | 34 | extern void cgroup_fork(struct task_struct *p); |
35 | extern void cgroup_post_fork(struct task_struct *p); | 35 | extern void cgroup_post_fork(struct task_struct *p); |
36 | extern void cgroup_exit(struct task_struct *p); | 36 | extern void cgroup_exit(struct task_struct *p); |
37 | extern int cgroupstats_build(struct cgroupstats *stats, | 37 | extern int cgroupstats_build(struct cgroupstats *stats, |
38 | struct dentry *dentry); | 38 | struct dentry *dentry); |
39 | 39 | ||
40 | extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, | 40 | extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, |
41 | struct pid *pid, struct task_struct *tsk); | 41 | struct pid *pid, struct task_struct *tsk); |
42 | 42 | ||
43 | /* define the enumeration of all cgroup subsystems */ | 43 | /* define the enumeration of all cgroup subsystems */ |
44 | #define SUBSYS(_x) _x ## _cgrp_id, | 44 | #define SUBSYS(_x) _x ## _cgrp_id, |
45 | enum cgroup_subsys_id { | 45 | enum cgroup_subsys_id { |
46 | #include <linux/cgroup_subsys.h> | 46 | #include <linux/cgroup_subsys.h> |
47 | CGROUP_SUBSYS_COUNT, | 47 | CGROUP_SUBSYS_COUNT, |
48 | }; | 48 | }; |
49 | #undef SUBSYS | 49 | #undef SUBSYS |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Per-subsystem/per-cgroup state maintained by the system. This is the | 52 | * Per-subsystem/per-cgroup state maintained by the system. This is the |
53 | * fundamental structural building block that controllers deal with. | 53 | * fundamental structural building block that controllers deal with. |
54 | * | 54 | * |
55 | * Fields marked with "PI:" are public and immutable and may be accessed | 55 | * Fields marked with "PI:" are public and immutable and may be accessed |
56 | * directly without synchronization. | 56 | * directly without synchronization. |
57 | */ | 57 | */ |
58 | struct cgroup_subsys_state { | 58 | struct cgroup_subsys_state { |
59 | /* PI: the cgroup that this css is attached to */ | 59 | /* PI: the cgroup that this css is attached to */ |
60 | struct cgroup *cgroup; | 60 | struct cgroup *cgroup; |
61 | 61 | ||
62 | /* PI: the cgroup subsystem that this css is attached to */ | 62 | /* PI: the cgroup subsystem that this css is attached to */ |
63 | struct cgroup_subsys *ss; | 63 | struct cgroup_subsys *ss; |
64 | 64 | ||
65 | /* reference count - access via css_[try]get() and css_put() */ | 65 | /* reference count - access via css_[try]get() and css_put() */ |
66 | struct percpu_ref refcnt; | 66 | struct percpu_ref refcnt; |
67 | 67 | ||
68 | /* PI: the parent css */ | 68 | /* PI: the parent css */ |
69 | struct cgroup_subsys_state *parent; | 69 | struct cgroup_subsys_state *parent; |
70 | 70 | ||
71 | /* siblings list anchored at the parent's ->children */ | 71 | /* siblings list anchored at the parent's ->children */ |
72 | struct list_head sibling; | 72 | struct list_head sibling; |
73 | struct list_head children; | 73 | struct list_head children; |
74 | 74 | ||
75 | /* | 75 | /* |
76 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The | 76 | * PI: Subsys-unique ID. 0 is unused and root is always 1. The |
77 | * matching css can be looked up using css_from_id(). | 77 | * matching css can be looked up using css_from_id(). |
78 | */ | 78 | */ |
79 | int id; | 79 | int id; |
80 | 80 | ||
81 | unsigned int flags; | 81 | unsigned int flags; |
82 | 82 | ||
83 | /* | 83 | /* |
84 | * Monotonically increasing unique serial number which defines a | 84 | * Monotonically increasing unique serial number which defines a |
85 | * uniform order among all csses. It's guaranteed that all | 85 | * uniform order among all csses. It's guaranteed that all |
86 | * ->children lists are in the ascending order of ->serial_nr and | 86 | * ->children lists are in the ascending order of ->serial_nr and |
87 | * used to allow interrupting and resuming iterations. | 87 | * used to allow interrupting and resuming iterations. |
88 | */ | 88 | */ |
89 | u64 serial_nr; | 89 | u64 serial_nr; |
90 | 90 | ||
91 | /* percpu_ref killing and RCU release */ | 91 | /* percpu_ref killing and RCU release */ |
92 | struct rcu_head rcu_head; | 92 | struct rcu_head rcu_head; |
93 | struct work_struct destroy_work; | 93 | struct work_struct destroy_work; |
94 | }; | 94 | }; |
95 | 95 | ||
96 | /* bits in struct cgroup_subsys_state flags field */ | 96 | /* bits in struct cgroup_subsys_state flags field */ |
97 | enum { | 97 | enum { |
98 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ | 98 | CSS_NO_REF = (1 << 0), /* no reference counting for this css */ |
99 | CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ | 99 | CSS_ONLINE = (1 << 1), /* between ->css_online() and ->css_offline() */ |
100 | CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ | 100 | CSS_RELEASED = (1 << 2), /* refcnt reached zero, released */ |
101 | }; | 101 | }; |
102 | 102 | ||
103 | /** | 103 | /** |
104 | * css_get - obtain a reference on the specified css | 104 | * css_get - obtain a reference on the specified css |
105 | * @css: target css | 105 | * @css: target css |
106 | * | 106 | * |
107 | * The caller must already have a reference. | 107 | * The caller must already have a reference. |
108 | */ | 108 | */ |
109 | static inline void css_get(struct cgroup_subsys_state *css) | 109 | static inline void css_get(struct cgroup_subsys_state *css) |
110 | { | 110 | { |
111 | if (!(css->flags & CSS_NO_REF)) | 111 | if (!(css->flags & CSS_NO_REF)) |
112 | percpu_ref_get(&css->refcnt); | 112 | percpu_ref_get(&css->refcnt); |
113 | } | 113 | } |
114 | 114 | ||
115 | /** | 115 | /** |
116 | * css_get_many - obtain references on the specified css | ||
117 | * @css: target css | ||
118 | * @n: number of references to get | ||
119 | * | ||
120 | * The caller must already have a reference. | ||
121 | */ | ||
122 | static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) | ||
123 | { | ||
124 | if (!(css->flags & CSS_NO_REF)) | ||
125 | percpu_ref_get_many(&css->refcnt, n); | ||
126 | } | ||
127 | |||
128 | /** | ||
116 | * css_tryget - try to obtain a reference on the specified css | 129 | * css_tryget - try to obtain a reference on the specified css |
117 | * @css: target css | 130 | * @css: target css |
118 | * | 131 | * |
119 | * Obtain a reference on @css unless it already has reached zero and is | 132 | * Obtain a reference on @css unless it already has reached zero and is |
120 | * being released. This function doesn't care whether @css is on or | 133 | * being released. This function doesn't care whether @css is on or |
121 | * offline. The caller naturally needs to ensure that @css is accessible | 134 | * offline. The caller naturally needs to ensure that @css is accessible |
122 | * but doesn't have to be holding a reference on it - IOW, RCU protected | 135 | * but doesn't have to be holding a reference on it - IOW, RCU protected |
123 | * access is good enough for this function. Returns %true if a reference | 136 | * access is good enough for this function. Returns %true if a reference |
124 | * count was successfully obtained; %false otherwise. | 137 | * count was successfully obtained; %false otherwise. |
125 | */ | 138 | */ |
126 | static inline bool css_tryget(struct cgroup_subsys_state *css) | 139 | static inline bool css_tryget(struct cgroup_subsys_state *css) |
127 | { | 140 | { |
128 | if (!(css->flags & CSS_NO_REF)) | 141 | if (!(css->flags & CSS_NO_REF)) |
129 | return percpu_ref_tryget(&css->refcnt); | 142 | return percpu_ref_tryget(&css->refcnt); |
130 | return true; | 143 | return true; |
131 | } | 144 | } |
132 | 145 | ||
133 | /** | 146 | /** |
134 | * css_tryget_online - try to obtain a reference on the specified css if online | 147 | * css_tryget_online - try to obtain a reference on the specified css if online |
135 | * @css: target css | 148 | * @css: target css |
136 | * | 149 | * |
137 | * Obtain a reference on @css if it's online. The caller naturally needs | 150 | * Obtain a reference on @css if it's online. The caller naturally needs |
138 | * to ensure that @css is accessible but doesn't have to be holding a | 151 | * to ensure that @css is accessible but doesn't have to be holding a |
139 | * reference on it - IOW, RCU protected access is good enough for this | 152 | * reference on it - IOW, RCU protected access is good enough for this |
140 | * function. Returns %true if a reference count was successfully obtained; | 153 | * function. Returns %true if a reference count was successfully obtained; |
141 | * %false otherwise. | 154 | * %false otherwise. |
142 | */ | 155 | */ |
143 | static inline bool css_tryget_online(struct cgroup_subsys_state *css) | 156 | static inline bool css_tryget_online(struct cgroup_subsys_state *css) |
144 | { | 157 | { |
145 | if (!(css->flags & CSS_NO_REF)) | 158 | if (!(css->flags & CSS_NO_REF)) |
146 | return percpu_ref_tryget_live(&css->refcnt); | 159 | return percpu_ref_tryget_live(&css->refcnt); |
147 | return true; | 160 | return true; |
148 | } | 161 | } |
149 | 162 | ||
150 | /** | 163 | /** |
151 | * css_put - put a css reference | 164 | * css_put - put a css reference |
152 | * @css: target css | 165 | * @css: target css |
153 | * | 166 | * |
154 | * Put a reference obtained via css_get() and css_tryget_online(). | 167 | * Put a reference obtained via css_get() and css_tryget_online(). |
155 | */ | 168 | */ |
156 | static inline void css_put(struct cgroup_subsys_state *css) | 169 | static inline void css_put(struct cgroup_subsys_state *css) |
157 | { | 170 | { |
158 | if (!(css->flags & CSS_NO_REF)) | 171 | if (!(css->flags & CSS_NO_REF)) |
159 | percpu_ref_put(&css->refcnt); | 172 | percpu_ref_put(&css->refcnt); |
173 | } | ||
174 | |||
175 | /** | ||
176 | * css_put_many - put css references | ||
177 | * @css: target css | ||
178 | * @n: number of references to put | ||
179 | * | ||
180 | * Put references obtained via css_get() and css_tryget_online(). | ||
181 | */ | ||
182 | static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) | ||
183 | { | ||
184 | if (!(css->flags & CSS_NO_REF)) | ||
185 | percpu_ref_put_many(&css->refcnt, n); | ||
160 | } | 186 | } |
161 | 187 | ||
162 | /* bits in struct cgroup flags field */ | 188 | /* bits in struct cgroup flags field */ |
163 | enum { | 189 | enum { |
164 | /* Control Group requires release notifications to userspace */ | 190 | /* Control Group requires release notifications to userspace */ |
165 | CGRP_NOTIFY_ON_RELEASE, | 191 | CGRP_NOTIFY_ON_RELEASE, |
166 | /* | 192 | /* |
167 | * Clone the parent's configuration when creating a new child | 193 | * Clone the parent's configuration when creating a new child |
168 | * cpuset cgroup. For historical reasons, this option can be | 194 | * cpuset cgroup. For historical reasons, this option can be |
169 | * specified at mount time and thus is implemented here. | 195 | * specified at mount time and thus is implemented here. |
170 | */ | 196 | */ |
171 | CGRP_CPUSET_CLONE_CHILDREN, | 197 | CGRP_CPUSET_CLONE_CHILDREN, |
172 | }; | 198 | }; |
173 | 199 | ||
174 | struct cgroup { | 200 | struct cgroup { |
175 | /* self css with NULL ->ss, points back to this cgroup */ | 201 | /* self css with NULL ->ss, points back to this cgroup */ |
176 | struct cgroup_subsys_state self; | 202 | struct cgroup_subsys_state self; |
177 | 203 | ||
178 | unsigned long flags; /* "unsigned long" so bitops work */ | 204 | unsigned long flags; /* "unsigned long" so bitops work */ |
179 | 205 | ||
180 | /* | 206 | /* |
181 | * idr allocated in-hierarchy ID. | 207 | * idr allocated in-hierarchy ID. |
182 | * | 208 | * |
183 | * ID 0 is not used, the ID of the root cgroup is always 1, and a | 209 | * ID 0 is not used, the ID of the root cgroup is always 1, and a |
184 | * new cgroup will be assigned with a smallest available ID. | 210 | * new cgroup will be assigned with a smallest available ID. |
185 | * | 211 | * |
186 | * Allocating/Removing ID must be protected by cgroup_mutex. | 212 | * Allocating/Removing ID must be protected by cgroup_mutex. |
187 | */ | 213 | */ |
188 | int id; | 214 | int id; |
189 | 215 | ||
190 | /* | 216 | /* |
191 | * If this cgroup contains any tasks, it contributes one to | 217 | * If this cgroup contains any tasks, it contributes one to |
192 | * populated_cnt. All children with non-zero popuplated_cnt of | 218 | * populated_cnt. All children with non-zero popuplated_cnt of |
193 | * their own contribute one. The count is zero iff there's no task | 219 | * their own contribute one. The count is zero iff there's no task |
194 | * in this cgroup or its subtree. | 220 | * in this cgroup or its subtree. |
195 | */ | 221 | */ |
196 | int populated_cnt; | 222 | int populated_cnt; |
197 | 223 | ||
198 | struct kernfs_node *kn; /* cgroup kernfs entry */ | 224 | struct kernfs_node *kn; /* cgroup kernfs entry */ |
199 | struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ | 225 | struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */ |
200 | 226 | ||
201 | /* | 227 | /* |
202 | * The bitmask of subsystems enabled on the child cgroups. | 228 | * The bitmask of subsystems enabled on the child cgroups. |
203 | * ->subtree_control is the one configured through | 229 | * ->subtree_control is the one configured through |
204 | * "cgroup.subtree_control" while ->child_subsys_mask is the | 230 | * "cgroup.subtree_control" while ->child_subsys_mask is the |
205 | * effective one which may have more subsystems enabled. | 231 | * effective one which may have more subsystems enabled. |
206 | * Controller knobs are made available iff it's enabled in | 232 | * Controller knobs are made available iff it's enabled in |
207 | * ->subtree_control. | 233 | * ->subtree_control. |
208 | */ | 234 | */ |
209 | unsigned int subtree_control; | 235 | unsigned int subtree_control; |
210 | unsigned int child_subsys_mask; | 236 | unsigned int child_subsys_mask; |
211 | 237 | ||
212 | /* Private pointers for each registered subsystem */ | 238 | /* Private pointers for each registered subsystem */ |
213 | struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; | 239 | struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; |
214 | 240 | ||
215 | struct cgroup_root *root; | 241 | struct cgroup_root *root; |
216 | 242 | ||
217 | /* | 243 | /* |
218 | * List of cgrp_cset_links pointing at css_sets with tasks in this | 244 | * List of cgrp_cset_links pointing at css_sets with tasks in this |
219 | * cgroup. Protected by css_set_lock. | 245 | * cgroup. Protected by css_set_lock. |
220 | */ | 246 | */ |
221 | struct list_head cset_links; | 247 | struct list_head cset_links; |
222 | 248 | ||
223 | /* | 249 | /* |
224 | * On the default hierarchy, a css_set for a cgroup with some | 250 | * On the default hierarchy, a css_set for a cgroup with some |
225 | * susbsys disabled will point to css's which are associated with | 251 | * susbsys disabled will point to css's which are associated with |
226 | * the closest ancestor which has the subsys enabled. The | 252 | * the closest ancestor which has the subsys enabled. The |
227 | * following lists all css_sets which point to this cgroup's css | 253 | * following lists all css_sets which point to this cgroup's css |
228 | * for the given subsystem. | 254 | * for the given subsystem. |
229 | */ | 255 | */ |
230 | struct list_head e_csets[CGROUP_SUBSYS_COUNT]; | 256 | struct list_head e_csets[CGROUP_SUBSYS_COUNT]; |
231 | 257 | ||
232 | /* | 258 | /* |
233 | * list of pidlists, up to two for each namespace (one for procs, one | 259 | * list of pidlists, up to two for each namespace (one for procs, one |
234 | * for tasks); created on demand. | 260 | * for tasks); created on demand. |
235 | */ | 261 | */ |
236 | struct list_head pidlists; | 262 | struct list_head pidlists; |
237 | struct mutex pidlist_mutex; | 263 | struct mutex pidlist_mutex; |
238 | 264 | ||
239 | /* used to wait for offlining of csses */ | 265 | /* used to wait for offlining of csses */ |
240 | wait_queue_head_t offline_waitq; | 266 | wait_queue_head_t offline_waitq; |
241 | 267 | ||
242 | /* used to schedule release agent */ | 268 | /* used to schedule release agent */ |
243 | struct work_struct release_agent_work; | 269 | struct work_struct release_agent_work; |
244 | }; | 270 | }; |
245 | 271 | ||
246 | #define MAX_CGROUP_ROOT_NAMELEN 64 | 272 | #define MAX_CGROUP_ROOT_NAMELEN 64 |
247 | 273 | ||
248 | /* cgroup_root->flags */ | 274 | /* cgroup_root->flags */ |
249 | enum { | 275 | enum { |
250 | CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ | 276 | CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */ |
251 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ | 277 | CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ |
252 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ | 278 | CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ |
253 | }; | 279 | }; |
254 | 280 | ||
255 | /* | 281 | /* |
256 | * A cgroup_root represents the root of a cgroup hierarchy, and may be | 282 | * A cgroup_root represents the root of a cgroup hierarchy, and may be |
257 | * associated with a kernfs_root to form an active hierarchy. This is | 283 | * associated with a kernfs_root to form an active hierarchy. This is |
258 | * internal to cgroup core. Don't access directly from controllers. | 284 | * internal to cgroup core. Don't access directly from controllers. |
259 | */ | 285 | */ |
260 | struct cgroup_root { | 286 | struct cgroup_root { |
261 | struct kernfs_root *kf_root; | 287 | struct kernfs_root *kf_root; |
262 | 288 | ||
263 | /* The bitmask of subsystems attached to this hierarchy */ | 289 | /* The bitmask of subsystems attached to this hierarchy */ |
264 | unsigned int subsys_mask; | 290 | unsigned int subsys_mask; |
265 | 291 | ||
266 | /* Unique id for this hierarchy. */ | 292 | /* Unique id for this hierarchy. */ |
267 | int hierarchy_id; | 293 | int hierarchy_id; |
268 | 294 | ||
269 | /* The root cgroup. Root is destroyed on its release. */ | 295 | /* The root cgroup. Root is destroyed on its release. */ |
270 | struct cgroup cgrp; | 296 | struct cgroup cgrp; |
271 | 297 | ||
272 | /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ | 298 | /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ |
273 | atomic_t nr_cgrps; | 299 | atomic_t nr_cgrps; |
274 | 300 | ||
275 | /* A list running through the active hierarchies */ | 301 | /* A list running through the active hierarchies */ |
276 | struct list_head root_list; | 302 | struct list_head root_list; |
277 | 303 | ||
278 | /* Hierarchy-specific flags */ | 304 | /* Hierarchy-specific flags */ |
279 | unsigned int flags; | 305 | unsigned int flags; |
280 | 306 | ||
281 | /* IDs for cgroups in this hierarchy */ | 307 | /* IDs for cgroups in this hierarchy */ |
282 | struct idr cgroup_idr; | 308 | struct idr cgroup_idr; |
283 | 309 | ||
284 | /* The path to use for release notifications. */ | 310 | /* The path to use for release notifications. */ |
285 | char release_agent_path[PATH_MAX]; | 311 | char release_agent_path[PATH_MAX]; |
286 | 312 | ||
287 | /* The name for this hierarchy - may be empty */ | 313 | /* The name for this hierarchy - may be empty */ |
288 | char name[MAX_CGROUP_ROOT_NAMELEN]; | 314 | char name[MAX_CGROUP_ROOT_NAMELEN]; |
289 | }; | 315 | }; |
290 | 316 | ||
291 | /* | 317 | /* |
292 | * A css_set is a structure holding pointers to a set of | 318 | * A css_set is a structure holding pointers to a set of |
293 | * cgroup_subsys_state objects. This saves space in the task struct | 319 | * cgroup_subsys_state objects. This saves space in the task struct |
294 | * object and speeds up fork()/exit(), since a single inc/dec and a | 320 | * object and speeds up fork()/exit(), since a single inc/dec and a |
295 | * list_add()/del() can bump the reference count on the entire cgroup | 321 | * list_add()/del() can bump the reference count on the entire cgroup |
296 | * set for a task. | 322 | * set for a task. |
297 | */ | 323 | */ |
298 | 324 | ||
299 | struct css_set { | 325 | struct css_set { |
300 | 326 | ||
301 | /* Reference count */ | 327 | /* Reference count */ |
302 | atomic_t refcount; | 328 | atomic_t refcount; |
303 | 329 | ||
304 | /* | 330 | /* |
305 | * List running through all cgroup groups in the same hash | 331 | * List running through all cgroup groups in the same hash |
306 | * slot. Protected by css_set_lock | 332 | * slot. Protected by css_set_lock |
307 | */ | 333 | */ |
308 | struct hlist_node hlist; | 334 | struct hlist_node hlist; |
309 | 335 | ||
310 | /* | 336 | /* |
311 | * Lists running through all tasks using this cgroup group. | 337 | * Lists running through all tasks using this cgroup group. |
312 | * mg_tasks lists tasks which belong to this cset but are in the | 338 | * mg_tasks lists tasks which belong to this cset but are in the |
313 | * process of being migrated out or in. Protected by | 339 | * process of being migrated out or in. Protected by |
314 | * css_set_rwsem, but, during migration, once tasks are moved to | 340 | * css_set_rwsem, but, during migration, once tasks are moved to |
315 | * mg_tasks, it can be read safely while holding cgroup_mutex. | 341 | * mg_tasks, it can be read safely while holding cgroup_mutex. |
316 | */ | 342 | */ |
317 | struct list_head tasks; | 343 | struct list_head tasks; |
318 | struct list_head mg_tasks; | 344 | struct list_head mg_tasks; |
319 | 345 | ||
320 | /* | 346 | /* |
321 | * List of cgrp_cset_links pointing at cgroups referenced from this | 347 | * List of cgrp_cset_links pointing at cgroups referenced from this |
322 | * css_set. Protected by css_set_lock. | 348 | * css_set. Protected by css_set_lock. |
323 | */ | 349 | */ |
324 | struct list_head cgrp_links; | 350 | struct list_head cgrp_links; |
325 | 351 | ||
326 | /* the default cgroup associated with this css_set */ | 352 | /* the default cgroup associated with this css_set */ |
327 | struct cgroup *dfl_cgrp; | 353 | struct cgroup *dfl_cgrp; |
328 | 354 | ||
329 | /* | 355 | /* |
330 | * Set of subsystem states, one for each subsystem. This array is | 356 | * Set of subsystem states, one for each subsystem. This array is |
331 | * immutable after creation apart from the init_css_set during | 357 | * immutable after creation apart from the init_css_set during |
332 | * subsystem registration (at boot time). | 358 | * subsystem registration (at boot time). |
333 | */ | 359 | */ |
334 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; | 360 | struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; |
335 | 361 | ||
336 | /* | 362 | /* |
337 | * List of csets participating in the on-going migration either as | 363 | * List of csets participating in the on-going migration either as |
338 | * source or destination. Protected by cgroup_mutex. | 364 | * source or destination. Protected by cgroup_mutex. |
339 | */ | 365 | */ |
340 | struct list_head mg_preload_node; | 366 | struct list_head mg_preload_node; |
341 | struct list_head mg_node; | 367 | struct list_head mg_node; |
342 | 368 | ||
343 | /* | 369 | /* |
344 | * If this cset is acting as the source of migration the following | 370 | * If this cset is acting as the source of migration the following |
345 | * two fields are set. mg_src_cgrp is the source cgroup of the | 371 | * two fields are set. mg_src_cgrp is the source cgroup of the |
346 | * on-going migration and mg_dst_cset is the destination cset the | 372 | * on-going migration and mg_dst_cset is the destination cset the |
347 | * target tasks on this cset should be migrated to. Protected by | 373 | * target tasks on this cset should be migrated to. Protected by |
348 | * cgroup_mutex. | 374 | * cgroup_mutex. |
349 | */ | 375 | */ |
350 | struct cgroup *mg_src_cgrp; | 376 | struct cgroup *mg_src_cgrp; |
351 | struct css_set *mg_dst_cset; | 377 | struct css_set *mg_dst_cset; |
352 | 378 | ||
353 | /* | 379 | /* |
354 | * On the default hierarhcy, ->subsys[ssid] may point to a css | 380 | * On the default hierarhcy, ->subsys[ssid] may point to a css |
355 | * attached to an ancestor instead of the cgroup this css_set is | 381 | * attached to an ancestor instead of the cgroup this css_set is |
356 | * associated with. The following node is anchored at | 382 | * associated with. The following node is anchored at |
357 | * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to | 383 | * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to |
358 | * iterate through all css's attached to a given cgroup. | 384 | * iterate through all css's attached to a given cgroup. |
359 | */ | 385 | */ |
360 | struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; | 386 | struct list_head e_cset_node[CGROUP_SUBSYS_COUNT]; |
361 | 387 | ||
362 | /* For RCU-protected deletion */ | 388 | /* For RCU-protected deletion */ |
363 | struct rcu_head rcu_head; | 389 | struct rcu_head rcu_head; |
364 | }; | 390 | }; |
365 | 391 | ||
366 | /* | 392 | /* |
367 | * struct cftype: handler definitions for cgroup control files | 393 | * struct cftype: handler definitions for cgroup control files |
368 | * | 394 | * |
369 | * When reading/writing to a file: | 395 | * When reading/writing to a file: |
370 | * - the cgroup to use is file->f_dentry->d_parent->d_fsdata | 396 | * - the cgroup to use is file->f_dentry->d_parent->d_fsdata |
371 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | 397 | * - the 'cftype' of the file is file->f_dentry->d_fsdata |
372 | */ | 398 | */ |
373 | 399 | ||
374 | /* cftype->flags */ | 400 | /* cftype->flags */ |
375 | enum { | 401 | enum { |
376 | CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ | 402 | CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ |
377 | CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ | 403 | CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ |
378 | CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ | 404 | CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ |
379 | 405 | ||
380 | /* internal flags, do not use outside cgroup core proper */ | 406 | /* internal flags, do not use outside cgroup core proper */ |
381 | __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ | 407 | __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ |
382 | __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ | 408 | __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ |
383 | }; | 409 | }; |
384 | 410 | ||
385 | #define MAX_CFTYPE_NAME 64 | 411 | #define MAX_CFTYPE_NAME 64 |
386 | 412 | ||
387 | struct cftype { | 413 | struct cftype { |
388 | /* | 414 | /* |
389 | * By convention, the name should begin with the name of the | 415 | * By convention, the name should begin with the name of the |
390 | * subsystem, followed by a period. Zero length string indicates | 416 | * subsystem, followed by a period. Zero length string indicates |
391 | * end of cftype array. | 417 | * end of cftype array. |
392 | */ | 418 | */ |
393 | char name[MAX_CFTYPE_NAME]; | 419 | char name[MAX_CFTYPE_NAME]; |
394 | int private; | 420 | int private; |
395 | /* | 421 | /* |
396 | * If not 0, file mode is set to this value, otherwise it will | 422 | * If not 0, file mode is set to this value, otherwise it will |
397 | * be figured out automatically | 423 | * be figured out automatically |
398 | */ | 424 | */ |
399 | umode_t mode; | 425 | umode_t mode; |
400 | 426 | ||
401 | /* | 427 | /* |
402 | * The maximum length of string, excluding trailing nul, that can | 428 | * The maximum length of string, excluding trailing nul, that can |
403 | * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. | 429 | * be passed to write. If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed. |
404 | */ | 430 | */ |
405 | size_t max_write_len; | 431 | size_t max_write_len; |
406 | 432 | ||
407 | /* CFTYPE_* flags */ | 433 | /* CFTYPE_* flags */ |
408 | unsigned int flags; | 434 | unsigned int flags; |
409 | 435 | ||
410 | /* | 436 | /* |
411 | * Fields used for internal bookkeeping. Initialized automatically | 437 | * Fields used for internal bookkeeping. Initialized automatically |
412 | * during registration. | 438 | * during registration. |
413 | */ | 439 | */ |
414 | struct cgroup_subsys *ss; /* NULL for cgroup core files */ | 440 | struct cgroup_subsys *ss; /* NULL for cgroup core files */ |
415 | struct list_head node; /* anchored at ss->cfts */ | 441 | struct list_head node; /* anchored at ss->cfts */ |
416 | struct kernfs_ops *kf_ops; | 442 | struct kernfs_ops *kf_ops; |
417 | 443 | ||
418 | /* | 444 | /* |
419 | * read_u64() is a shortcut for the common case of returning a | 445 | * read_u64() is a shortcut for the common case of returning a |
420 | * single integer. Use it in place of read() | 446 | * single integer. Use it in place of read() |
421 | */ | 447 | */ |
422 | u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); | 448 | u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); |
423 | /* | 449 | /* |
424 | * read_s64() is a signed version of read_u64() | 450 | * read_s64() is a signed version of read_u64() |
425 | */ | 451 | */ |
426 | s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); | 452 | s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); |
427 | 453 | ||
428 | /* generic seq_file read interface */ | 454 | /* generic seq_file read interface */ |
429 | int (*seq_show)(struct seq_file *sf, void *v); | 455 | int (*seq_show)(struct seq_file *sf, void *v); |
430 | 456 | ||
431 | /* optional ops, implement all or none */ | 457 | /* optional ops, implement all or none */ |
432 | void *(*seq_start)(struct seq_file *sf, loff_t *ppos); | 458 | void *(*seq_start)(struct seq_file *sf, loff_t *ppos); |
433 | void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); | 459 | void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); |
434 | void (*seq_stop)(struct seq_file *sf, void *v); | 460 | void (*seq_stop)(struct seq_file *sf, void *v); |
435 | 461 | ||
436 | /* | 462 | /* |
437 | * write_u64() is a shortcut for the common case of accepting | 463 | * write_u64() is a shortcut for the common case of accepting |
438 | * a single integer (as parsed by simple_strtoull) from | 464 | * a single integer (as parsed by simple_strtoull) from |
439 | * userspace. Use in place of write(); return 0 or error. | 465 | * userspace. Use in place of write(); return 0 or error. |
440 | */ | 466 | */ |
441 | int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, | 467 | int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, |
442 | u64 val); | 468 | u64 val); |
443 | /* | 469 | /* |
444 | * write_s64() is a signed version of write_u64() | 470 | * write_s64() is a signed version of write_u64() |
445 | */ | 471 | */ |
446 | int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, | 472 | int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, |
447 | s64 val); | 473 | s64 val); |
448 | 474 | ||
449 | /* | 475 | /* |
450 | * write() is the generic write callback which maps directly to | 476 | * write() is the generic write callback which maps directly to |
451 | * kernfs write operation and overrides all other operations. | 477 | * kernfs write operation and overrides all other operations. |
452 | * Maximum write size is determined by ->max_write_len. Use | 478 | * Maximum write size is determined by ->max_write_len. Use |
453 | * of_css/cft() to access the associated css and cft. | 479 | * of_css/cft() to access the associated css and cft. |
454 | */ | 480 | */ |
455 | ssize_t (*write)(struct kernfs_open_file *of, | 481 | ssize_t (*write)(struct kernfs_open_file *of, |
456 | char *buf, size_t nbytes, loff_t off); | 482 | char *buf, size_t nbytes, loff_t off); |
457 | 483 | ||
458 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 484 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
459 | struct lock_class_key lockdep_key; | 485 | struct lock_class_key lockdep_key; |
460 | #endif | 486 | #endif |
461 | }; | 487 | }; |
462 | 488 | ||
463 | extern struct cgroup_root cgrp_dfl_root; | 489 | extern struct cgroup_root cgrp_dfl_root; |
464 | extern struct css_set init_css_set; | 490 | extern struct css_set init_css_set; |
465 | 491 | ||
466 | /** | 492 | /** |
467 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy | 493 | * cgroup_on_dfl - test whether a cgroup is on the default hierarchy |
468 | * @cgrp: the cgroup of interest | 494 | * @cgrp: the cgroup of interest |
469 | * | 495 | * |
470 | * The default hierarchy is the v2 interface of cgroup and this function | 496 | * The default hierarchy is the v2 interface of cgroup and this function |
471 | * can be used to test whether a cgroup is on the default hierarchy for | 497 | * can be used to test whether a cgroup is on the default hierarchy for |
472 | * cases where a subsystem should behave differnetly depending on the | 498 | * cases where a subsystem should behave differnetly depending on the |
473 | * interface version. | 499 | * interface version. |
474 | * | 500 | * |
475 | * The set of behaviors which change on the default hierarchy are still | 501 | * The set of behaviors which change on the default hierarchy are still |
476 | * being determined and the mount option is prefixed with __DEVEL__. | 502 | * being determined and the mount option is prefixed with __DEVEL__. |
477 | * | 503 | * |
478 | * List of changed behaviors: | 504 | * List of changed behaviors: |
479 | * | 505 | * |
480 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" | 506 | * - Mount options "noprefix", "xattr", "clone_children", "release_agent" |
481 | * and "name" are disallowed. | 507 | * and "name" are disallowed. |
482 | * | 508 | * |
483 | * - When mounting an existing superblock, mount options should match. | 509 | * - When mounting an existing superblock, mount options should match. |
484 | * | 510 | * |
485 | * - Remount is disallowed. | 511 | * - Remount is disallowed. |
486 | * | 512 | * |
487 | * - rename(2) is disallowed. | 513 | * - rename(2) is disallowed. |
488 | * | 514 | * |
489 | * - "tasks" is removed. Everything should be at process granularity. Use | 515 | * - "tasks" is removed. Everything should be at process granularity. Use |
490 | * "cgroup.procs" instead. | 516 | * "cgroup.procs" instead. |
491 | * | 517 | * |
492 | * - "cgroup.procs" is not sorted. pids will be unique unless they got | 518 | * - "cgroup.procs" is not sorted. pids will be unique unless they got |
493 | * recycled inbetween reads. | 519 | * recycled inbetween reads. |
494 | * | 520 | * |
495 | * - "release_agent" and "notify_on_release" are removed. Replacement | 521 | * - "release_agent" and "notify_on_release" are removed. Replacement |
496 | * notification mechanism will be implemented. | 522 | * notification mechanism will be implemented. |
497 | * | 523 | * |
498 | * - "cgroup.clone_children" is removed. | 524 | * - "cgroup.clone_children" is removed. |
499 | * | 525 | * |
500 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup | 526 | * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup |
501 | * and its descendants contain no task; otherwise, 1. The file also | 527 | * and its descendants contain no task; otherwise, 1. The file also |
502 | * generates kernfs notification which can be monitored through poll and | 528 | * generates kernfs notification which can be monitored through poll and |
503 | * [di]notify when the value of the file changes. | 529 | * [di]notify when the value of the file changes. |
504 | * | 530 | * |
505 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and | 531 | * - cpuset: tasks will be kept in empty cpusets when hotplug happens and |
506 | * take masks of ancestors with non-empty cpus/mems, instead of being | 532 | * take masks of ancestors with non-empty cpus/mems, instead of being |
507 | * moved to an ancestor. | 533 | * moved to an ancestor. |
508 | * | 534 | * |
509 | * - cpuset: a task can be moved into an empty cpuset, and again it takes | 535 | * - cpuset: a task can be moved into an empty cpuset, and again it takes |
510 | * masks of ancestors. | 536 | * masks of ancestors. |
511 | * | 537 | * |
512 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag | 538 | * - memcg: use_hierarchy is on by default and the cgroup file for the flag |
513 | * is not created. | 539 | * is not created. |
514 | * | 540 | * |
515 | * - blkcg: blk-throttle becomes properly hierarchical. | 541 | * - blkcg: blk-throttle becomes properly hierarchical. |
516 | * | 542 | * |
517 | * - debug: disallowed on the default hierarchy. | 543 | * - debug: disallowed on the default hierarchy. |
518 | */ | 544 | */ |
519 | static inline bool cgroup_on_dfl(const struct cgroup *cgrp) | 545 | static inline bool cgroup_on_dfl(const struct cgroup *cgrp) |
520 | { | 546 | { |
521 | return cgrp->root == &cgrp_dfl_root; | 547 | return cgrp->root == &cgrp_dfl_root; |
522 | } | 548 | } |
523 | 549 | ||
524 | /* no synchronization, the result can only be used as a hint */ | 550 | /* no synchronization, the result can only be used as a hint */ |
525 | static inline bool cgroup_has_tasks(struct cgroup *cgrp) | 551 | static inline bool cgroup_has_tasks(struct cgroup *cgrp) |
526 | { | 552 | { |
527 | return !list_empty(&cgrp->cset_links); | 553 | return !list_empty(&cgrp->cset_links); |
528 | } | 554 | } |
529 | 555 | ||
530 | /* returns ino associated with a cgroup */ | 556 | /* returns ino associated with a cgroup */ |
531 | static inline ino_t cgroup_ino(struct cgroup *cgrp) | 557 | static inline ino_t cgroup_ino(struct cgroup *cgrp) |
532 | { | 558 | { |
533 | return cgrp->kn->ino; | 559 | return cgrp->kn->ino; |
534 | } | 560 | } |
535 | 561 | ||
536 | /* cft/css accessors for cftype->write() operation */ | 562 | /* cft/css accessors for cftype->write() operation */ |
537 | static inline struct cftype *of_cft(struct kernfs_open_file *of) | 563 | static inline struct cftype *of_cft(struct kernfs_open_file *of) |
538 | { | 564 | { |
539 | return of->kn->priv; | 565 | return of->kn->priv; |
540 | } | 566 | } |
541 | 567 | ||
542 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); | 568 | struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); |
543 | 569 | ||
544 | /* cft/css accessors for cftype->seq_*() operations */ | 570 | /* cft/css accessors for cftype->seq_*() operations */ |
545 | static inline struct cftype *seq_cft(struct seq_file *seq) | 571 | static inline struct cftype *seq_cft(struct seq_file *seq) |
546 | { | 572 | { |
547 | return of_cft(seq->private); | 573 | return of_cft(seq->private); |
548 | } | 574 | } |
549 | 575 | ||
550 | static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) | 576 | static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) |
551 | { | 577 | { |
552 | return of_css(seq->private); | 578 | return of_css(seq->private); |
553 | } | 579 | } |
554 | 580 | ||
555 | /* | 581 | /* |
556 | * Name / path handling functions. All are thin wrappers around the kernfs | 582 | * Name / path handling functions. All are thin wrappers around the kernfs |
557 | * counterparts and can be called under any context. | 583 | * counterparts and can be called under any context. |
558 | */ | 584 | */ |
559 | 585 | ||
560 | static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) | 586 | static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) |
561 | { | 587 | { |
562 | return kernfs_name(cgrp->kn, buf, buflen); | 588 | return kernfs_name(cgrp->kn, buf, buflen); |
563 | } | 589 | } |
564 | 590 | ||
565 | static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, | 591 | static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, |
566 | size_t buflen) | 592 | size_t buflen) |
567 | { | 593 | { |
568 | return kernfs_path(cgrp->kn, buf, buflen); | 594 | return kernfs_path(cgrp->kn, buf, buflen); |
569 | } | 595 | } |
570 | 596 | ||
571 | static inline void pr_cont_cgroup_name(struct cgroup *cgrp) | 597 | static inline void pr_cont_cgroup_name(struct cgroup *cgrp) |
572 | { | 598 | { |
573 | pr_cont_kernfs_name(cgrp->kn); | 599 | pr_cont_kernfs_name(cgrp->kn); |
574 | } | 600 | } |
575 | 601 | ||
576 | static inline void pr_cont_cgroup_path(struct cgroup *cgrp) | 602 | static inline void pr_cont_cgroup_path(struct cgroup *cgrp) |
577 | { | 603 | { |
578 | pr_cont_kernfs_path(cgrp->kn); | 604 | pr_cont_kernfs_path(cgrp->kn); |
579 | } | 605 | } |
580 | 606 | ||
581 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); | 607 | char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); |
582 | 608 | ||
583 | int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | 609 | int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); |
584 | int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); | 610 | int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); |
585 | int cgroup_rm_cftypes(struct cftype *cfts); | 611 | int cgroup_rm_cftypes(struct cftype *cfts); |
586 | 612 | ||
587 | bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); | 613 | bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); |
588 | 614 | ||
589 | /* | 615 | /* |
590 | * Control Group taskset, used to pass around set of tasks to cgroup_subsys | 616 | * Control Group taskset, used to pass around set of tasks to cgroup_subsys |
591 | * methods. | 617 | * methods. |
592 | */ | 618 | */ |
593 | struct cgroup_taskset; | 619 | struct cgroup_taskset; |
594 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); | 620 | struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); |
595 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); | 621 | struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); |
596 | 622 | ||
597 | /** | 623 | /** |
598 | * cgroup_taskset_for_each - iterate cgroup_taskset | 624 | * cgroup_taskset_for_each - iterate cgroup_taskset |
599 | * @task: the loop cursor | 625 | * @task: the loop cursor |
600 | * @tset: taskset to iterate | 626 | * @tset: taskset to iterate |
601 | */ | 627 | */ |
602 | #define cgroup_taskset_for_each(task, tset) \ | 628 | #define cgroup_taskset_for_each(task, tset) \ |
603 | for ((task) = cgroup_taskset_first((tset)); (task); \ | 629 | for ((task) = cgroup_taskset_first((tset)); (task); \ |
604 | (task) = cgroup_taskset_next((tset))) | 630 | (task) = cgroup_taskset_next((tset))) |
605 | 631 | ||
606 | /* | 632 | /* |
607 | * Control Group subsystem type. | 633 | * Control Group subsystem type. |
608 | * See Documentation/cgroups/cgroups.txt for details | 634 | * See Documentation/cgroups/cgroups.txt for details |
609 | */ | 635 | */ |
610 | 636 | ||
611 | struct cgroup_subsys { | 637 | struct cgroup_subsys { |
612 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); | 638 | struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); |
613 | int (*css_online)(struct cgroup_subsys_state *css); | 639 | int (*css_online)(struct cgroup_subsys_state *css); |
614 | void (*css_offline)(struct cgroup_subsys_state *css); | 640 | void (*css_offline)(struct cgroup_subsys_state *css); |
615 | void (*css_free)(struct cgroup_subsys_state *css); | 641 | void (*css_free)(struct cgroup_subsys_state *css); |
616 | void (*css_reset)(struct cgroup_subsys_state *css); | 642 | void (*css_reset)(struct cgroup_subsys_state *css); |
617 | 643 | ||
618 | int (*can_attach)(struct cgroup_subsys_state *css, | 644 | int (*can_attach)(struct cgroup_subsys_state *css, |
619 | struct cgroup_taskset *tset); | 645 | struct cgroup_taskset *tset); |
620 | void (*cancel_attach)(struct cgroup_subsys_state *css, | 646 | void (*cancel_attach)(struct cgroup_subsys_state *css, |
621 | struct cgroup_taskset *tset); | 647 | struct cgroup_taskset *tset); |
622 | void (*attach)(struct cgroup_subsys_state *css, | 648 | void (*attach)(struct cgroup_subsys_state *css, |
623 | struct cgroup_taskset *tset); | 649 | struct cgroup_taskset *tset); |
624 | void (*fork)(struct task_struct *task); | 650 | void (*fork)(struct task_struct *task); |
625 | void (*exit)(struct cgroup_subsys_state *css, | 651 | void (*exit)(struct cgroup_subsys_state *css, |
626 | struct cgroup_subsys_state *old_css, | 652 | struct cgroup_subsys_state *old_css, |
627 | struct task_struct *task); | 653 | struct task_struct *task); |
628 | void (*bind)(struct cgroup_subsys_state *root_css); | 654 | void (*bind)(struct cgroup_subsys_state *root_css); |
629 | 655 | ||
630 | int disabled; | 656 | int disabled; |
631 | int early_init; | 657 | int early_init; |
632 | 658 | ||
633 | /* | 659 | /* |
634 | * If %false, this subsystem is properly hierarchical - | 660 | * If %false, this subsystem is properly hierarchical - |
635 | * configuration, resource accounting and restriction on a parent | 661 | * configuration, resource accounting and restriction on a parent |
636 | * cgroup cover those of its children. If %true, hierarchy support | 662 | * cgroup cover those of its children. If %true, hierarchy support |
637 | * is broken in some ways - some subsystems ignore hierarchy | 663 | * is broken in some ways - some subsystems ignore hierarchy |
638 | * completely while others are only implemented half-way. | 664 | * completely while others are only implemented half-way. |
639 | * | 665 | * |
640 | * It's now disallowed to create nested cgroups if the subsystem is | 666 | * It's now disallowed to create nested cgroups if the subsystem is |
641 | * broken and cgroup core will emit a warning message on such | 667 | * broken and cgroup core will emit a warning message on such |
642 | * cases. Eventually, all subsystems will be made properly | 668 | * cases. Eventually, all subsystems will be made properly |
643 | * hierarchical and this will go away. | 669 | * hierarchical and this will go away. |
644 | */ | 670 | */ |
645 | bool broken_hierarchy; | 671 | bool broken_hierarchy; |
646 | bool warned_broken_hierarchy; | 672 | bool warned_broken_hierarchy; |
647 | 673 | ||
648 | /* the following two fields are initialized automtically during boot */ | 674 | /* the following two fields are initialized automtically during boot */ |
649 | int id; | 675 | int id; |
650 | #define MAX_CGROUP_TYPE_NAMELEN 32 | 676 | #define MAX_CGROUP_TYPE_NAMELEN 32 |
651 | const char *name; | 677 | const char *name; |
652 | 678 | ||
653 | /* link to parent, protected by cgroup_lock() */ | 679 | /* link to parent, protected by cgroup_lock() */ |
654 | struct cgroup_root *root; | 680 | struct cgroup_root *root; |
655 | 681 | ||
656 | /* idr for css->id */ | 682 | /* idr for css->id */ |
657 | struct idr css_idr; | 683 | struct idr css_idr; |
658 | 684 | ||
659 | /* | 685 | /* |
660 | * List of cftypes. Each entry is the first entry of an array | 686 | * List of cftypes. Each entry is the first entry of an array |
661 | * terminated by zero length name. | 687 | * terminated by zero length name. |
662 | */ | 688 | */ |
663 | struct list_head cfts; | 689 | struct list_head cfts; |
664 | 690 | ||
665 | /* | 691 | /* |
666 | * Base cftypes which are automatically registered. The two can | 692 | * Base cftypes which are automatically registered. The two can |
667 | * point to the same array. | 693 | * point to the same array. |
668 | */ | 694 | */ |
669 | struct cftype *dfl_cftypes; /* for the default hierarchy */ | 695 | struct cftype *dfl_cftypes; /* for the default hierarchy */ |
670 | struct cftype *legacy_cftypes; /* for the legacy hierarchies */ | 696 | struct cftype *legacy_cftypes; /* for the legacy hierarchies */ |
671 | 697 | ||
672 | /* | 698 | /* |
673 | * A subsystem may depend on other subsystems. When such subsystem | 699 | * A subsystem may depend on other subsystems. When such subsystem |
674 | * is enabled on a cgroup, the depended-upon subsystems are enabled | 700 | * is enabled on a cgroup, the depended-upon subsystems are enabled |
675 | * together if available. Subsystems enabled due to dependency are | 701 | * together if available. Subsystems enabled due to dependency are |
676 | * not visible to userland until explicitly enabled. The following | 702 | * not visible to userland until explicitly enabled. The following |
677 | * specifies the mask of subsystems that this one depends on. | 703 | * specifies the mask of subsystems that this one depends on. |
678 | */ | 704 | */ |
679 | unsigned int depends_on; | 705 | unsigned int depends_on; |
680 | }; | 706 | }; |
681 | 707 | ||
682 | #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; | 708 | #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; |
683 | #include <linux/cgroup_subsys.h> | 709 | #include <linux/cgroup_subsys.h> |
684 | #undef SUBSYS | 710 | #undef SUBSYS |
685 | 711 | ||
686 | /** | 712 | /** |
687 | * task_css_set_check - obtain a task's css_set with extra access conditions | 713 | * task_css_set_check - obtain a task's css_set with extra access conditions |
688 | * @task: the task to obtain css_set for | 714 | * @task: the task to obtain css_set for |
689 | * @__c: extra condition expression to be passed to rcu_dereference_check() | 715 | * @__c: extra condition expression to be passed to rcu_dereference_check() |
690 | * | 716 | * |
691 | * A task's css_set is RCU protected, initialized and exited while holding | 717 | * A task's css_set is RCU protected, initialized and exited while holding |
692 | * task_lock(), and can only be modified while holding both cgroup_mutex | 718 | * task_lock(), and can only be modified while holding both cgroup_mutex |
693 | * and task_lock() while the task is alive. This macro verifies that the | 719 | * and task_lock() while the task is alive. This macro verifies that the |
694 | * caller is inside proper critical section and returns @task's css_set. | 720 | * caller is inside proper critical section and returns @task's css_set. |
695 | * | 721 | * |
696 | * The caller can also specify additional allowed conditions via @__c, such | 722 | * The caller can also specify additional allowed conditions via @__c, such |
697 | * as locks used during the cgroup_subsys::attach() methods. | 723 | * as locks used during the cgroup_subsys::attach() methods. |
698 | */ | 724 | */ |
699 | #ifdef CONFIG_PROVE_RCU | 725 | #ifdef CONFIG_PROVE_RCU |
700 | extern struct mutex cgroup_mutex; | 726 | extern struct mutex cgroup_mutex; |
701 | extern struct rw_semaphore css_set_rwsem; | 727 | extern struct rw_semaphore css_set_rwsem; |
702 | #define task_css_set_check(task, __c) \ | 728 | #define task_css_set_check(task, __c) \ |
703 | rcu_dereference_check((task)->cgroups, \ | 729 | rcu_dereference_check((task)->cgroups, \ |
704 | lockdep_is_held(&cgroup_mutex) || \ | 730 | lockdep_is_held(&cgroup_mutex) || \ |
705 | lockdep_is_held(&css_set_rwsem) || \ | 731 | lockdep_is_held(&css_set_rwsem) || \ |
706 | ((task)->flags & PF_EXITING) || (__c)) | 732 | ((task)->flags & PF_EXITING) || (__c)) |
707 | #else | 733 | #else |
708 | #define task_css_set_check(task, __c) \ | 734 | #define task_css_set_check(task, __c) \ |
709 | rcu_dereference((task)->cgroups) | 735 | rcu_dereference((task)->cgroups) |
710 | #endif | 736 | #endif |
711 | 737 | ||
712 | /** | 738 | /** |
713 | * task_css_check - obtain css for (task, subsys) w/ extra access conds | 739 | * task_css_check - obtain css for (task, subsys) w/ extra access conds |
714 | * @task: the target task | 740 | * @task: the target task |
715 | * @subsys_id: the target subsystem ID | 741 | * @subsys_id: the target subsystem ID |
716 | * @__c: extra condition expression to be passed to rcu_dereference_check() | 742 | * @__c: extra condition expression to be passed to rcu_dereference_check() |
717 | * | 743 | * |
718 | * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The | 744 | * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The |
719 | * synchronization rules are the same as task_css_set_check(). | 745 | * synchronization rules are the same as task_css_set_check(). |
720 | */ | 746 | */ |
721 | #define task_css_check(task, subsys_id, __c) \ | 747 | #define task_css_check(task, subsys_id, __c) \ |
722 | task_css_set_check((task), (__c))->subsys[(subsys_id)] | 748 | task_css_set_check((task), (__c))->subsys[(subsys_id)] |
723 | 749 | ||
724 | /** | 750 | /** |
725 | * task_css_set - obtain a task's css_set | 751 | * task_css_set - obtain a task's css_set |
726 | * @task: the task to obtain css_set for | 752 | * @task: the task to obtain css_set for |
727 | * | 753 | * |
728 | * See task_css_set_check(). | 754 | * See task_css_set_check(). |
729 | */ | 755 | */ |
730 | static inline struct css_set *task_css_set(struct task_struct *task) | 756 | static inline struct css_set *task_css_set(struct task_struct *task) |
731 | { | 757 | { |
732 | return task_css_set_check(task, false); | 758 | return task_css_set_check(task, false); |
733 | } | 759 | } |
734 | 760 | ||
735 | /** | 761 | /** |
736 | * task_css - obtain css for (task, subsys) | 762 | * task_css - obtain css for (task, subsys) |
737 | * @task: the target task | 763 | * @task: the target task |
738 | * @subsys_id: the target subsystem ID | 764 | * @subsys_id: the target subsystem ID |
739 | * | 765 | * |
740 | * See task_css_check(). | 766 | * See task_css_check(). |
741 | */ | 767 | */ |
742 | static inline struct cgroup_subsys_state *task_css(struct task_struct *task, | 768 | static inline struct cgroup_subsys_state *task_css(struct task_struct *task, |
743 | int subsys_id) | 769 | int subsys_id) |
744 | { | 770 | { |
745 | return task_css_check(task, subsys_id, false); | 771 | return task_css_check(task, subsys_id, false); |
746 | } | 772 | } |
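For reference, a minimal usage sketch (the helper name is hypothetical, not part of this header): a controller resolves its css for a task under rcu_read_lock() and pins it with css_tryget_online() before dropping the lock.

/* Hypothetical helper: pin the memory css of @task. */
static struct cgroup_subsys_state *get_task_memory_css(struct task_struct *task)
{
	struct cgroup_subsys_state *css;

	rcu_read_lock();
	do {
		/* the css is only stable under the RCU read lock */
		css = task_css(task, memory_cgrp_id);
	} while (!css_tryget_online(css));
	rcu_read_unlock();

	return css;	/* caller releases with css_put() */
}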
747 | 773 | ||
748 | /** | 774 | /** |
749 | * task_css_is_root - test whether a task belongs to the root css | 775 | * task_css_is_root - test whether a task belongs to the root css |
750 | * @task: the target task | 776 | * @task: the target task |
751 | * @subsys_id: the target subsystem ID | 777 | * @subsys_id: the target subsystem ID |
752 | * | 778 | * |
753 | * Test whether @task belongs to the root css on the specified subsystem. | 779 | * Test whether @task belongs to the root css on the specified subsystem. |
754 | * May be invoked in any context. | 780 | * May be invoked in any context. |
755 | */ | 781 | */ |
756 | static inline bool task_css_is_root(struct task_struct *task, int subsys_id) | 782 | static inline bool task_css_is_root(struct task_struct *task, int subsys_id) |
757 | { | 783 | { |
758 | return task_css_check(task, subsys_id, true) == | 784 | return task_css_check(task, subsys_id, true) == |
759 | init_css_set.subsys[subsys_id]; | 785 | init_css_set.subsys[subsys_id]; |
760 | } | 786 | } |
761 | 787 | ||
762 | static inline struct cgroup *task_cgroup(struct task_struct *task, | 788 | static inline struct cgroup *task_cgroup(struct task_struct *task, |
763 | int subsys_id) | 789 | int subsys_id) |
764 | { | 790 | { |
765 | return task_css(task, subsys_id)->cgroup; | 791 | return task_css(task, subsys_id)->cgroup; |
766 | } | 792 | } |
767 | 793 | ||
768 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, | 794 | struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, |
769 | struct cgroup_subsys_state *parent); | 795 | struct cgroup_subsys_state *parent); |
770 | 796 | ||
771 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); | 797 | struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); |
772 | 798 | ||
773 | /** | 799 | /** |
774 | * css_for_each_child - iterate through children of a css | 800 | * css_for_each_child - iterate through children of a css |
775 | * @pos: the css * to use as the loop cursor | 801 | * @pos: the css * to use as the loop cursor |
776 | * @parent: css whose children to walk | 802 | * @parent: css whose children to walk |
777 | * | 803 | * |
778 | * Walk @parent's children. Must be called under rcu_read_lock(). | 804 | * Walk @parent's children. Must be called under rcu_read_lock(). |
779 | * | 805 | * |
780 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 806 | * If a subsystem synchronizes ->css_online() and the start of iteration, a |
781 | * css which finished ->css_online() is guaranteed to be visible in | 807 | * css which finished ->css_online() is guaranteed to be visible in |
782 | * future iterations and will stay visible until the last reference is put. | 808 | * future iterations and will stay visible until the last reference is put. |
783 | * A css which hasn't finished ->css_online() or already finished | 809 | * A css which hasn't finished ->css_online() or already finished |
784 | * ->css_offline() may show up during traversal. It's each subsystem's | 810 | * ->css_offline() may show up during traversal. It's each subsystem's |
785 | * responsibility to synchronize against on/offlining. | 811 | * responsibility to synchronize against on/offlining. |
786 | * | 812 | * |
787 | * It is allowed to temporarily drop RCU read lock during iteration. The | 813 | * It is allowed to temporarily drop RCU read lock during iteration. The |
788 | * caller is responsible for ensuring that @pos remains accessible until | 814 | * caller is responsible for ensuring that @pos remains accessible until |
789 | * the start of the next iteration by, for example, bumping the css refcnt. | 815 | * the start of the next iteration by, for example, bumping the css refcnt. |
790 | */ | 816 | */ |
791 | #define css_for_each_child(pos, parent) \ | 817 | #define css_for_each_child(pos, parent) \ |
792 | for ((pos) = css_next_child(NULL, (parent)); (pos); \ | 818 | for ((pos) = css_next_child(NULL, (parent)); (pos); \ |
793 | (pos) = css_next_child((pos), (parent))) | 819 | (pos) = css_next_child((pos), (parent))) |
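As a usage sketch (the helper is illustrative, not part of this header), a walk over the direct children only requires the RCU read lock:

/* Illustrative: count the current children of @parent. */
static int my_count_children(struct cgroup_subsys_state *parent)
{
	struct cgroup_subsys_state *child;
	int n = 0;

	rcu_read_lock();
	css_for_each_child(child, parent)
		n++;
	rcu_read_unlock();

	return n;
}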
794 | 820 | ||
795 | struct cgroup_subsys_state * | 821 | struct cgroup_subsys_state * |
796 | css_next_descendant_pre(struct cgroup_subsys_state *pos, | 822 | css_next_descendant_pre(struct cgroup_subsys_state *pos, |
797 | struct cgroup_subsys_state *css); | 823 | struct cgroup_subsys_state *css); |
798 | 824 | ||
799 | struct cgroup_subsys_state * | 825 | struct cgroup_subsys_state * |
800 | css_rightmost_descendant(struct cgroup_subsys_state *pos); | 826 | css_rightmost_descendant(struct cgroup_subsys_state *pos); |
801 | 827 | ||
802 | /** | 828 | /** |
803 | * css_for_each_descendant_pre - pre-order walk of a css's descendants | 829 | * css_for_each_descendant_pre - pre-order walk of a css's descendants |
804 | * @pos: the css * to use as the loop cursor | 830 | * @pos: the css * to use as the loop cursor |
805 | * @root: css whose descendants to walk | 831 | * @root: css whose descendants to walk |
806 | * | 832 | * |
807 | * Walk @root's descendants. @root is included in the iteration and is the | 833 | * Walk @root's descendants. @root is included in the iteration and is the |
808 | * first node to be visited. Must be called under rcu_read_lock(). | 834 | * first node to be visited. Must be called under rcu_read_lock(). |
809 | * | 835 | * |
810 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 836 | * If a subsystem synchronizes ->css_online() and the start of iteration, a |
811 | * css which finished ->css_online() is guaranteed to be visible in | 837 | * css which finished ->css_online() is guaranteed to be visible in |
812 | * future iterations and will stay visible until the last reference is put. | 838 | * future iterations and will stay visible until the last reference is put. |
813 | * A css which hasn't finished ->css_online() or already finished | 839 | * A css which hasn't finished ->css_online() or already finished |
814 | * ->css_offline() may show up during traversal. It's each subsystem's | 840 | * ->css_offline() may show up during traversal. It's each subsystem's |
815 | * responsibility to synchronize against on/offlining. | 841 | * responsibility to synchronize against on/offlining. |
816 | * | 842 | * |
817 | * For example, the following guarantees that a descendant can't escape | 843 | * For example, the following guarantees that a descendant can't escape |
818 | * state updates of its ancestors. | 844 | * state updates of its ancestors. |
819 | * | 845 | * |
820 | * my_online(@css) | 846 | * my_online(@css) |
821 | * { | 847 | * { |
822 | * Lock @css's parent and @css; | 848 | * Lock @css's parent and @css; |
823 | * Inherit state from the parent; | 849 | * Inherit state from the parent; |
824 | * Unlock both. | 850 | * Unlock both. |
825 | * } | 851 | * } |
826 | * | 852 | * |
827 | * my_update_state(@css) | 853 | * my_update_state(@css) |
828 | * { | 854 | * { |
829 | * css_for_each_descendant_pre(@pos, @css) { | 855 | * css_for_each_descendant_pre(@pos, @css) { |
830 | * Lock @pos; | 856 | * Lock @pos; |
831 | * if (@pos == @css) | 857 | * if (@pos == @css) |
832 | * Update @css's state; | 858 | * Update @css's state; |
833 | * else | 859 | * else |
834 | * Verify @pos is alive and inherit state from its parent; | 860 | * Verify @pos is alive and inherit state from its parent; |
835 | * Unlock @pos; | 861 | * Unlock @pos; |
836 | * } | 862 | * } |
837 | * } | 863 | * } |
838 | * | 864 | * |
839 | * As long as the inheriting step, including checking the parent state, is | 865 | * As long as the inheriting step, including checking the parent state, is |
840 | * enclosed inside @pos locking, double-locking the parent isn't necessary | 866 | * enclosed inside @pos locking, double-locking the parent isn't necessary |
841 | * while inheriting. The state update to the parent is guaranteed to be | 867 | * while inheriting. The state update to the parent is guaranteed to be |
842 | * visible by walking order and, as long as inheriting operations to the | 868 | * visible by walking order and, as long as inheriting operations to the |
843 | * same @pos are atomic to each other, multiple updates racing each other | 869 | * same @pos are atomic to each other, multiple updates racing each other |
844 | * still result in the correct state. It's guaranteed that at least one | 870 | * still result in the correct state. It's guaranteed that at least one |
845 | * inheritance happens for any css after the latest update to its parent. | 871 | * inheritance happens for any css after the latest update to its parent. |
846 | * | 872 | * |
847 | * If checking parent's state requires locking the parent, each inheriting | 873 | * If checking parent's state requires locking the parent, each inheriting |
848 | * iteration should lock and unlock both @pos->parent and @pos. | 874 | * iteration should lock and unlock both @pos->parent and @pos. |
849 | * | 875 | * |
850 | * Alternatively, a subsystem may choose to use a single global lock to | 876 | * Alternatively, a subsystem may choose to use a single global lock to |
851 | * synchronize ->css_online() and ->css_offline() against tree-walking | 877 | * synchronize ->css_online() and ->css_offline() against tree-walking |
852 | * operations. | 878 | * operations. |
853 | * | 879 | * |
854 | * It is allowed to temporarily drop RCU read lock during iteration. The | 880 | * It is allowed to temporarily drop RCU read lock during iteration. The |
855 | * caller is responsible for ensuring that @pos remains accessible until | 881 | * caller is responsible for ensuring that @pos remains accessible until |
856 | * the start of the next iteration by, for example, bumping the css refcnt. | 882 | * the start of the next iteration by, for example, bumping the css refcnt. |
857 | */ | 883 | */ |
858 | #define css_for_each_descendant_pre(pos, css) \ | 884 | #define css_for_each_descendant_pre(pos, css) \ |
859 | for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ | 885 | for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ |
860 | (pos) = css_next_descendant_pre((pos), (css))) | 886 | (pos) = css_next_descendant_pre((pos), (css))) |
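The my_update_state() pseudocode above could be fleshed out roughly as follows; struct my_state and the my_css() lookup are illustrative assumptions, not part of this header.

/* Hypothetical per-css state used by the sketch below. */
struct my_state {
	spinlock_t lock;
	bool online;
	int value;
};

static void my_update_state(struct cgroup_subsys_state *css, int value)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, css) {
		struct my_state *ms = my_css(pos);	/* illustrative lookup */

		spin_lock(&ms->lock);
		if (pos == css)
			ms->value = value;			/* update the walk's root */
		else if (ms->online)
			ms->value = my_css(pos->parent)->value;	/* inherit from parent */
		spin_unlock(&ms->lock);
	}
	rcu_read_unlock();
}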
861 | 887 | ||
862 | struct cgroup_subsys_state * | 888 | struct cgroup_subsys_state * |
863 | css_next_descendant_post(struct cgroup_subsys_state *pos, | 889 | css_next_descendant_post(struct cgroup_subsys_state *pos, |
864 | struct cgroup_subsys_state *css); | 890 | struct cgroup_subsys_state *css); |
865 | 891 | ||
866 | /** | 892 | /** |
867 | * css_for_each_descendant_post - post-order walk of a css's descendants | 893 | * css_for_each_descendant_post - post-order walk of a css's descendants |
868 | * @pos: the css * to use as the loop cursor | 894 | * @pos: the css * to use as the loop cursor |
869 | * @css: css whose descendants to walk | 895 | * @css: css whose descendants to walk |
870 | * | 896 | * |
871 | * Similar to css_for_each_descendant_pre() but performs post-order | 897 | * Similar to css_for_each_descendant_pre() but performs post-order |
872 | * traversal instead. @css is included in the iteration and is the last | 898 | * traversal instead. @css is included in the iteration and is the last |
873 | * node to be visited. | 899 | * node to be visited. |
874 | * | 900 | * |
875 | * If a subsystem synchronizes ->css_online() and the start of iteration, a | 901 | * If a subsystem synchronizes ->css_online() and the start of iteration, a |
876 | * css which finished ->css_online() is guaranteed to be visible in | 902 | * css which finished ->css_online() is guaranteed to be visible in |
877 | * future iterations and will stay visible until the last reference is put. | 903 | * future iterations and will stay visible until the last reference is put. |
878 | * A css which hasn't finished ->css_online() or already finished | 904 | * A css which hasn't finished ->css_online() or already finished |
879 | * ->css_offline() may show up during traversal. It's each subsystem's | 905 | * ->css_offline() may show up during traversal. It's each subsystem's |
880 | * responsibility to synchronize against on/offlining. | 906 | * responsibility to synchronize against on/offlining. |
881 | * | 907 | * |
882 | * Note that the walk visibility guarantee example described in pre-order | 908 | * Note that the walk visibility guarantee example described in pre-order |
883 | * walk doesn't apply in the same way to post-order walks. | 909 | * walk doesn't apply in the same way to post-order walks. |
884 | */ | 910 | */ |
885 | #define css_for_each_descendant_post(pos, css) \ | 911 | #define css_for_each_descendant_post(pos, css) \ |
886 | for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ | 912 | for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ |
887 | (pos) = css_next_descendant_post((pos), (css))) | 913 | (pos) = css_next_descendant_post((pos), (css))) |
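A post-order walk is the natural shape for children-first teardown; a sketch with an illustrative body:

/* Illustrative: visit descendants children-first, @root last. */
static void my_teardown(struct cgroup_subsys_state *root)
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root) {
		/* all children of @pos have already been visited here */
	}
	rcu_read_unlock();
}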
888 | 914 | ||
889 | bool css_has_online_children(struct cgroup_subsys_state *css); | 915 | bool css_has_online_children(struct cgroup_subsys_state *css); |
890 | 916 | ||
891 | /* A css_task_iter should be treated as an opaque object */ | 917 | /* A css_task_iter should be treated as an opaque object */ |
892 | struct css_task_iter { | 918 | struct css_task_iter { |
893 | struct cgroup_subsys *ss; | 919 | struct cgroup_subsys *ss; |
894 | 920 | ||
895 | struct list_head *cset_pos; | 921 | struct list_head *cset_pos; |
896 | struct list_head *cset_head; | 922 | struct list_head *cset_head; |
897 | 923 | ||
898 | struct list_head *task_pos; | 924 | struct list_head *task_pos; |
899 | struct list_head *tasks_head; | 925 | struct list_head *tasks_head; |
900 | struct list_head *mg_tasks_head; | 926 | struct list_head *mg_tasks_head; |
901 | }; | 927 | }; |
902 | 928 | ||
903 | void css_task_iter_start(struct cgroup_subsys_state *css, | 929 | void css_task_iter_start(struct cgroup_subsys_state *css, |
904 | struct css_task_iter *it); | 930 | struct css_task_iter *it); |
905 | struct task_struct *css_task_iter_next(struct css_task_iter *it); | 931 | struct task_struct *css_task_iter_next(struct css_task_iter *it); |
906 | void css_task_iter_end(struct css_task_iter *it); | 932 | void css_task_iter_end(struct css_task_iter *it); |
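The iterator is used in start/next/end form; a sketch of a typical consumer (the helper and its body are illustrative):

/* Illustrative: visit every task currently attached to @css. */
static void my_walk_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it))) {
		/* e.g. account or signal @task here */
	}
	css_task_iter_end(&it);
}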
907 | 933 | ||
908 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); | 934 | int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); |
909 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); | 935 | int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); |
910 | 936 | ||
911 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, | 937 | struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, |
912 | struct cgroup_subsys *ss); | 938 | struct cgroup_subsys *ss); |
913 | 939 | ||
914 | #else /* !CONFIG_CGROUPS */ | 940 | #else /* !CONFIG_CGROUPS */ |
915 | 941 | ||
916 | static inline int cgroup_init_early(void) { return 0; } | 942 | static inline int cgroup_init_early(void) { return 0; } |
917 | static inline int cgroup_init(void) { return 0; } | 943 | static inline int cgroup_init(void) { return 0; } |
918 | static inline void cgroup_fork(struct task_struct *p) {} | 944 | static inline void cgroup_fork(struct task_struct *p) {} |
919 | static inline void cgroup_post_fork(struct task_struct *p) {} | 945 | static inline void cgroup_post_fork(struct task_struct *p) {} |
920 | static inline void cgroup_exit(struct task_struct *p) {} | 946 | static inline void cgroup_exit(struct task_struct *p) {} |
921 | 947 | ||
922 | static inline int cgroupstats_build(struct cgroupstats *stats, | 948 | static inline int cgroupstats_build(struct cgroupstats *stats, |
923 | struct dentry *dentry) | 949 | struct dentry *dentry) |
924 | { | 950 | { |
925 | return -EINVAL; | 951 | return -EINVAL; |
926 | } | 952 | } |
927 | 953 | ||
928 | /* No cgroups - nothing to do */ | 954 | /* No cgroups - nothing to do */ |
929 | static inline int cgroup_attach_task_all(struct task_struct *from, | 955 | static inline int cgroup_attach_task_all(struct task_struct *from, |
930 | struct task_struct *t) | 956 | struct task_struct *t) |
931 | { | 957 | { |
932 | return 0; | 958 | return 0; |
933 | } | 959 | } |
934 | 960 | ||
935 | #endif /* !CONFIG_CGROUPS */ | 961 | #endif /* !CONFIG_CGROUPS */ |
936 | 962 | ||
937 | #endif /* _LINUX_CGROUP_H */ | 963 | #endif /* _LINUX_CGROUP_H */ |
938 | 964 |
include/linux/percpu-refcount.h
1 | /* | 1 | /* |
2 | * Percpu refcounts: | 2 | * Percpu refcounts: |
3 | * (C) 2012 Google, Inc. | 3 | * (C) 2012 Google, Inc. |
4 | * Author: Kent Overstreet <koverstreet@google.com> | 4 | * Author: Kent Overstreet <koverstreet@google.com> |
5 | * | 5 | * |
6 | * This implements a refcount with similar semantics to atomic_t - atomic_inc(), | 6 | * This implements a refcount with similar semantics to atomic_t - atomic_inc(), |
7 | * atomic_dec_and_test() - but percpu. | 7 | * atomic_dec_and_test() - but percpu. |
8 | * | 8 | * |
9 | * There's one important difference between percpu refs and normal atomic_t | 9 | * There's one important difference between percpu refs and normal atomic_t |
10 | * refcounts; you have to keep track of your initial refcount, and then when you | 10 | * refcounts; you have to keep track of your initial refcount, and then when you |
11 | * start shutting down you call percpu_ref_kill() _before_ dropping the initial | 11 | * start shutting down you call percpu_ref_kill() _before_ dropping the initial |
12 | * refcount. | 12 | * refcount. |
13 | * | 13 | * |
14 | * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less | 14 | * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less |
15 | * than an atomic_t - this is because of the way shutdown works, see | 15 | * than an atomic_t - this is because of the way shutdown works, see |
16 | * percpu_ref_kill()/PERCPU_COUNT_BIAS. | 16 | * percpu_ref_kill()/PERCPU_COUNT_BIAS. |
17 | * | 17 | * |
18 | * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the | 18 | * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the |
19 | * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() | 19 | * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() |
20 | * puts the ref back in single atomic_t mode, collecting the per cpu refs and | 20 | * puts the ref back in single atomic_t mode, collecting the per cpu refs and |
21 | * issuing the appropriate barriers, and then marks the ref as shutting down so | 21 | * issuing the appropriate barriers, and then marks the ref as shutting down so |
22 | * that percpu_ref_put() will check for the ref hitting 0. After it returns, | 22 | * that percpu_ref_put() will check for the ref hitting 0. After it returns, |
23 | * it's safe to drop the initial ref. | 23 | * it's safe to drop the initial ref. |
24 | * | 24 | * |
25 | * USAGE: | 25 | * USAGE: |
26 | * | 26 | * |
27 | * See fs/aio.c for some example usage; it's used there for struct kioctx, which | 27 | * See fs/aio.c for some example usage; it's used there for struct kioctx, which |
28 | * is created when userspace calls io_setup(), and destroyed when userspace | 28 | * is created when userspace calls io_setup(), and destroyed when userspace |
29 | * calls io_destroy() or the process exits. | 29 | * calls io_destroy() or the process exits. |
30 | * | 30 | * |
31 | * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it | 31 | * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it |
32 | * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove | 32 | * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove |
33 | * the kioctx from the process's list of kioctxs - after that, there can't be | 33 | * the kioctx from the process's list of kioctxs - after that, there can't be |
34 | * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop | 34 | * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop |
35 | * the initial ref with percpu_ref_put(). | 35 | * the initial ref with percpu_ref_put(). |
36 | * | 36 | * |
37 | * Code that does a two stage shutdown like this often needs some kind of | 37 | * Code that does a two stage shutdown like this often needs some kind of |
38 | * explicit synchronization to ensure the initial refcount can only be dropped | 38 | * explicit synchronization to ensure the initial refcount can only be dropped |
39 | * once - percpu_ref_kill() does this for you, it returns true once and false if | 39 | * once - percpu_ref_kill() does this for you, it returns true once and false if |
40 | * someone else already called it. The aio code uses it this way, but it's not | 40 | * someone else already called it. The aio code uses it this way, but it's not |
41 | * necessary if the code has some other mechanism to synchronize teardown. | 41 | * necessary if the code has some other mechanism to synchronize teardown. |
42 | * | 42 | * |
43 | */ | 43 | */ |
44 | 44 | ||
45 | #ifndef _LINUX_PERCPU_REFCOUNT_H | 45 | #ifndef _LINUX_PERCPU_REFCOUNT_H |
46 | #define _LINUX_PERCPU_REFCOUNT_H | 46 | #define _LINUX_PERCPU_REFCOUNT_H |
47 | 47 | ||
48 | #include <linux/atomic.h> | 48 | #include <linux/atomic.h> |
49 | #include <linux/kernel.h> | 49 | #include <linux/kernel.h> |
50 | #include <linux/percpu.h> | 50 | #include <linux/percpu.h> |
51 | #include <linux/rcupdate.h> | 51 | #include <linux/rcupdate.h> |
52 | #include <linux/gfp.h> | 52 | #include <linux/gfp.h> |
53 | 53 | ||
54 | struct percpu_ref; | 54 | struct percpu_ref; |
55 | typedef void (percpu_ref_func_t)(struct percpu_ref *); | 55 | typedef void (percpu_ref_func_t)(struct percpu_ref *); |
56 | 56 | ||
57 | /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ | 57 | /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ |
58 | enum { | 58 | enum { |
59 | __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ | 59 | __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ |
60 | __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ | 60 | __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ |
61 | __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, | 61 | __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, |
62 | 62 | ||
63 | __PERCPU_REF_FLAG_BITS = 2, | 63 | __PERCPU_REF_FLAG_BITS = 2, |
64 | }; | 64 | }; |
65 | 65 | ||
66 | /* @flags for percpu_ref_init() */ | 66 | /* @flags for percpu_ref_init() */ |
67 | enum { | 67 | enum { |
68 | /* | 68 | /* |
69 | * Start w/ ref == 1 in atomic mode. Can be switched to percpu | 69 | * Start w/ ref == 1 in atomic mode. Can be switched to percpu |
70 | * operation using percpu_ref_switch_to_percpu(). If initialized | 70 | * operation using percpu_ref_switch_to_percpu(). If initialized |
71 | * with this flag, the ref will stay in atomic mode until | 71 | * with this flag, the ref will stay in atomic mode until |
72 | * percpu_ref_switch_to_percpu() is invoked on it. | 72 | * percpu_ref_switch_to_percpu() is invoked on it. |
73 | */ | 73 | */ |
74 | PERCPU_REF_INIT_ATOMIC = 1 << 0, | 74 | PERCPU_REF_INIT_ATOMIC = 1 << 0, |
75 | 75 | ||
76 | /* | 76 | /* |
77 | * Start dead w/ ref == 0 in atomic mode. Must be revived with | 77 | * Start dead w/ ref == 0 in atomic mode. Must be revived with |
78 | * percpu_ref_reinit() before use. Implies INIT_ATOMIC. | 78 | * percpu_ref_reinit() before use. Implies INIT_ATOMIC. |
79 | */ | 79 | */ |
80 | PERCPU_REF_INIT_DEAD = 1 << 1, | 80 | PERCPU_REF_INIT_DEAD = 1 << 1, |
81 | }; | 81 | }; |
82 | 82 | ||
83 | struct percpu_ref { | 83 | struct percpu_ref { |
84 | atomic_long_t count; | 84 | atomic_long_t count; |
85 | /* | 85 | /* |
86 | * The low bit of the pointer indicates whether the ref is in percpu | 86 | * The low bit of the pointer indicates whether the ref is in percpu |
87 | * mode; if set, then get/put will manipulate the atomic_t. | 87 | * mode; if set, then get/put will manipulate the atomic_t. |
88 | */ | 88 | */ |
89 | unsigned long percpu_count_ptr; | 89 | unsigned long percpu_count_ptr; |
90 | percpu_ref_func_t *release; | 90 | percpu_ref_func_t *release; |
91 | percpu_ref_func_t *confirm_switch; | 91 | percpu_ref_func_t *confirm_switch; |
92 | bool force_atomic:1; | 92 | bool force_atomic:1; |
93 | struct rcu_head rcu; | 93 | struct rcu_head rcu; |
94 | }; | 94 | }; |
95 | 95 | ||
96 | int __must_check percpu_ref_init(struct percpu_ref *ref, | 96 | int __must_check percpu_ref_init(struct percpu_ref *ref, |
97 | percpu_ref_func_t *release, unsigned int flags, | 97 | percpu_ref_func_t *release, unsigned int flags, |
98 | gfp_t gfp); | 98 | gfp_t gfp); |
99 | void percpu_ref_exit(struct percpu_ref *ref); | 99 | void percpu_ref_exit(struct percpu_ref *ref); |
100 | void percpu_ref_switch_to_atomic(struct percpu_ref *ref, | 100 | void percpu_ref_switch_to_atomic(struct percpu_ref *ref, |
101 | percpu_ref_func_t *confirm_switch); | 101 | percpu_ref_func_t *confirm_switch); |
102 | void percpu_ref_switch_to_percpu(struct percpu_ref *ref); | 102 | void percpu_ref_switch_to_percpu(struct percpu_ref *ref); |
103 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, | 103 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, |
104 | percpu_ref_func_t *confirm_kill); | 104 | percpu_ref_func_t *confirm_kill); |
105 | void percpu_ref_reinit(struct percpu_ref *ref); | 105 | void percpu_ref_reinit(struct percpu_ref *ref); |
106 | 106 | ||
107 | /** | 107 | /** |
108 | * percpu_ref_kill - drop the initial ref | 108 | * percpu_ref_kill - drop the initial ref |
109 | * @ref: percpu_ref to kill | 109 | * @ref: percpu_ref to kill |
110 | * | 110 | * |
111 | * Must be used to drop the initial ref on a percpu refcount; must be called | 111 | * Must be used to drop the initial ref on a percpu refcount; must be called |
112 | * precisely once before shutdown. | 112 | * precisely once before shutdown. |
113 | * | 113 | * |
114 | * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the | 114 | * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the |
115 | * percpu counters and dropping the initial ref. | 115 | * percpu counters and dropping the initial ref. |
116 | */ | 116 | */ |
117 | static inline void percpu_ref_kill(struct percpu_ref *ref) | 117 | static inline void percpu_ref_kill(struct percpu_ref *ref) |
118 | { | 118 | { |
119 | return percpu_ref_kill_and_confirm(ref, NULL); | 119 | return percpu_ref_kill_and_confirm(ref, NULL); |
120 | } | 120 | } |
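Putting the pieces together, a condensed lifetime sketch of the two-stage shutdown described at the top of this file; struct my_obj and its helpers are hypothetical:

/* Hypothetical object whose lifetime is governed by a percpu_ref. */
struct my_obj {
	struct percpu_ref ref;
};

static void my_obj_release(struct percpu_ref *ref)
{
	struct my_obj *obj = container_of(ref, struct my_obj, ref);

	percpu_ref_exit(ref);	/* free the percpu counter */
	kfree(obj);
}

static struct my_obj *my_obj_create(void)
{
	struct my_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (!obj)
		return NULL;
	if (percpu_ref_init(&obj->ref, my_obj_release, 0, GFP_KERNEL)) {
		kfree(obj);
		return NULL;
	}
	return obj;	/* holds the initial reference */
}

static void my_obj_destroy(struct my_obj *obj)
{
	/* stop handing out new lookups elsewhere first, then drop the initial ref */
	percpu_ref_kill(&obj->ref);
}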
121 | 121 | ||
122 | /* | 122 | /* |
123 | * Internal helper. Don't use outside percpu-refcount proper. The | 123 | * Internal helper. Don't use outside percpu-refcount proper. The |
124 | * function doesn't return the pointer and leave the NULL test to the caller | 124 | * function doesn't return the pointer and leave the NULL test to the caller |
125 | * because doing so forces the compiler to generate two conditional | 125 | * because doing so forces the compiler to generate two conditional |
126 | * branches as it can't assume that @ref->percpu_count is not NULL. | 126 | * branches as it can't assume that @ref->percpu_count is not NULL. |
127 | */ | 127 | */ |
128 | static inline bool __ref_is_percpu(struct percpu_ref *ref, | 128 | static inline bool __ref_is_percpu(struct percpu_ref *ref, |
129 | unsigned long __percpu **percpu_countp) | 129 | unsigned long __percpu **percpu_countp) |
130 | { | 130 | { |
131 | unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); | 131 | unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); |
132 | 132 | ||
133 | /* paired with smp_store_release() in percpu_ref_reinit() */ | 133 | /* paired with smp_store_release() in percpu_ref_reinit() */ |
134 | smp_read_barrier_depends(); | 134 | smp_read_barrier_depends(); |
135 | 135 | ||
136 | /* | 136 | /* |
137 | * Theoretically, the following could test just ATOMIC; however, | 137 | * Theoretically, the following could test just ATOMIC; however, |
138 | * then we'd have to mask off DEAD separately as DEAD may be | 138 | * then we'd have to mask off DEAD separately as DEAD may be |
139 | * visible without ATOMIC if we race with percpu_ref_kill(). DEAD | 139 | * visible without ATOMIC if we race with percpu_ref_kill(). DEAD |
140 | * implies ATOMIC anyway. Test them together. | 140 | * implies ATOMIC anyway. Test them together. |
141 | */ | 141 | */ |
142 | if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) | 142 | if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD)) |
143 | return false; | 143 | return false; |
144 | 144 | ||
145 | *percpu_countp = (unsigned long __percpu *)percpu_ptr; | 145 | *percpu_countp = (unsigned long __percpu *)percpu_ptr; |
146 | return true; | 146 | return true; |
147 | } | 147 | } |
148 | 148 | ||
149 | /** | 149 | /** |
150 | * percpu_ref_get - increment a percpu refcount | 150 | * percpu_ref_get_many - increment a percpu refcount |
151 | * @ref: percpu_ref to get | 151 | * @ref: percpu_ref to get |
152 | * @nr: number of references to get | ||
152 | * | 153 | * |
153 | * Analagous to atomic_long_inc(). | 154 | * Analogous to atomic_long_add(). |
154 | * | 155 | * |
155 | * This function is safe to call as long as @ref is between init and exit. | 156 | * This function is safe to call as long as @ref is between init and exit. |
156 | */ | 157 | */ |
157 | static inline void percpu_ref_get(struct percpu_ref *ref) | 158 | static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr) |
158 | { | 159 | { |
159 | unsigned long __percpu *percpu_count; | 160 | unsigned long __percpu *percpu_count; |
160 | 161 | ||
161 | rcu_read_lock_sched(); | 162 | rcu_read_lock_sched(); |
162 | 163 | ||
163 | if (__ref_is_percpu(ref, &percpu_count)) | 164 | if (__ref_is_percpu(ref, &percpu_count)) |
164 | this_cpu_inc(*percpu_count); | 165 | this_cpu_add(*percpu_count, nr); |
165 | else | 166 | else |
166 | atomic_long_inc(&ref->count); | 167 | atomic_long_add(nr, &ref->count); |
167 | 168 | ||
168 | rcu_read_unlock_sched(); | 169 | rcu_read_unlock_sched(); |
169 | } | 170 | } |
170 | 171 | ||
171 | /** | 172 | /** |
173 | * percpu_ref_get - increment a percpu refcount | ||
174 | * @ref: percpu_ref to get | ||
175 | * | ||
176 | * Analogous to atomic_long_inc(). | ||
177 | * | ||
178 | * This function is safe to call as long as @ref is between init and exit. | ||
179 | */ | ||
180 | static inline void percpu_ref_get(struct percpu_ref *ref) | ||
181 | { | ||
182 | percpu_ref_get_many(ref, 1); | ||
183 | } | ||
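percpu_ref_get_many() (together with percpu_ref_put_many() below) lets a caller take or drop a whole batch of references in a single percpu operation, which is what allows memcg to pin the css once per charged page without per-page overhead. A sketch, with illustrative wrapper names:

/* Illustrative: take one reference per page of a charge batch ... */
static void my_charge_pin(struct percpu_ref *ref, unsigned int nr_pages)
{
	percpu_ref_get_many(ref, nr_pages);	/* single percpu add */
}

/* ... and drop the whole batch again on uncharge. */
static void my_uncharge_unpin(struct percpu_ref *ref, unsigned int nr_pages)
{
	percpu_ref_put_many(ref, nr_pages);	/* may invoke ->release() */
}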
184 | |||
185 | /** | ||
172 | * percpu_ref_tryget - try to increment a percpu refcount | 186 | * percpu_ref_tryget - try to increment a percpu refcount |
173 | * @ref: percpu_ref to try-get | 187 | * @ref: percpu_ref to try-get |
174 | * | 188 | * |
175 | * Increment a percpu refcount unless its count already reached zero. | 189 | * Increment a percpu refcount unless its count already reached zero. |
176 | * Returns %true on success; %false on failure. | 190 | * Returns %true on success; %false on failure. |
177 | * | 191 | * |
178 | * This function is safe to call as long as @ref is between init and exit. | 192 | * This function is safe to call as long as @ref is between init and exit. |
179 | */ | 193 | */ |
180 | static inline bool percpu_ref_tryget(struct percpu_ref *ref) | 194 | static inline bool percpu_ref_tryget(struct percpu_ref *ref) |
181 | { | 195 | { |
182 | unsigned long __percpu *percpu_count; | 196 | unsigned long __percpu *percpu_count; |
183 | int ret; | 197 | int ret; |
184 | 198 | ||
185 | rcu_read_lock_sched(); | 199 | rcu_read_lock_sched(); |
186 | 200 | ||
187 | if (__ref_is_percpu(ref, &percpu_count)) { | 201 | if (__ref_is_percpu(ref, &percpu_count)) { |
188 | this_cpu_inc(*percpu_count); | 202 | this_cpu_inc(*percpu_count); |
189 | ret = true; | 203 | ret = true; |
190 | } else { | 204 | } else { |
191 | ret = atomic_long_inc_not_zero(&ref->count); | 205 | ret = atomic_long_inc_not_zero(&ref->count); |
192 | } | 206 | } |
193 | 207 | ||
194 | rcu_read_unlock_sched(); | 208 | rcu_read_unlock_sched(); |
195 | 209 | ||
196 | return ret; | 210 | return ret; |
197 | } | 211 | } |
198 | 212 | ||
199 | /** | 213 | /** |
200 | * percpu_ref_tryget_live - try to increment a live percpu refcount | 214 | * percpu_ref_tryget_live - try to increment a live percpu refcount |
201 | * @ref: percpu_ref to try-get | 215 | * @ref: percpu_ref to try-get |
202 | * | 216 | * |
203 | * Increment a percpu refcount unless it has already been killed. Returns | 217 | * Increment a percpu refcount unless it has already been killed. Returns |
204 | * %true on success; %false on failure. | 218 | * %true on success; %false on failure. |
205 | * | 219 | * |
206 | * Completion of percpu_ref_kill() in itself doesn't guarantee that this | 220 | * Completion of percpu_ref_kill() in itself doesn't guarantee that this |
207 | * function will fail. For such guarantee, percpu_ref_kill_and_confirm() | 221 | * function will fail. For such guarantee, percpu_ref_kill_and_confirm() |
208 | * should be used. After the confirm_kill callback is invoked, it's | 222 | * should be used. After the confirm_kill callback is invoked, it's |
209 | * guaranteed that no new reference will be given out by | 223 | * guaranteed that no new reference will be given out by |
210 | * percpu_ref_tryget_live(). | 224 | * percpu_ref_tryget_live(). |
211 | * | 225 | * |
212 | * This function is safe to call as long as @ref is between init and exit. | 226 | * This function is safe to call as long as @ref is between init and exit. |
213 | */ | 227 | */ |
214 | static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) | 228 | static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) |
215 | { | 229 | { |
216 | unsigned long __percpu *percpu_count; | 230 | unsigned long __percpu *percpu_count; |
217 | int ret = false; | 231 | int ret = false; |
218 | 232 | ||
219 | rcu_read_lock_sched(); | 233 | rcu_read_lock_sched(); |
220 | 234 | ||
221 | if (__ref_is_percpu(ref, &percpu_count)) { | 235 | if (__ref_is_percpu(ref, &percpu_count)) { |
222 | this_cpu_inc(*percpu_count); | 236 | this_cpu_inc(*percpu_count); |
223 | ret = true; | 237 | ret = true; |
224 | } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { | 238 | } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { |
225 | ret = atomic_long_inc_not_zero(&ref->count); | 239 | ret = atomic_long_inc_not_zero(&ref->count); |
226 | } | 240 | } |
227 | 241 | ||
228 | rcu_read_unlock_sched(); | 242 | rcu_read_unlock_sched(); |
229 | 243 | ||
230 | return ret; | 244 | return ret; |
231 | } | 245 | } |
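A typical tryget_live pattern, reusing the hypothetical my_obj from the earlier sketch: enter the object only while it has not been killed.

/* Illustrative: do work against @obj only if it is still live. */
static int my_obj_enter(struct my_obj *obj)
{
	if (!percpu_ref_tryget_live(&obj->ref))
		return -ENODEV;		/* already shut down */

	/* ... work under the reference ... */

	percpu_ref_put(&obj->ref);
	return 0;
}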
232 | 246 | ||
233 | /** | 247 | /** |
234 | * percpu_ref_put - decrement a percpu refcount | 248 | * percpu_ref_put_many - decrement a percpu refcount |
235 | * @ref: percpu_ref to put | 249 | * @ref: percpu_ref to put |
250 | * @nr: number of references to put | ||
236 | * | 251 | * |
237 | * Decrement the refcount, and if 0, call the release function (which was passed | 252 | * Decrement the refcount, and if 0, call the release function (which was passed |
238 | * to percpu_ref_init()) | 253 | * to percpu_ref_init()) |
239 | * | 254 | * |
240 | * This function is safe to call as long as @ref is between init and exit. | 255 | * This function is safe to call as long as @ref is between init and exit. |
241 | */ | 256 | */ |
242 | static inline void percpu_ref_put(struct percpu_ref *ref) | 257 | static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr) |
243 | { | 258 | { |
244 | unsigned long __percpu *percpu_count; | 259 | unsigned long __percpu *percpu_count; |
245 | 260 | ||
246 | rcu_read_lock_sched(); | 261 | rcu_read_lock_sched(); |
247 | 262 | ||
248 | if (__ref_is_percpu(ref, &percpu_count)) | 263 | if (__ref_is_percpu(ref, &percpu_count)) |
249 | this_cpu_dec(*percpu_count); | 264 | this_cpu_sub(*percpu_count, nr); |
250 | else if (unlikely(atomic_long_dec_and_test(&ref->count))) | 265 | else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) |
251 | ref->release(ref); | 266 | ref->release(ref); |
252 | 267 | ||
253 | rcu_read_unlock_sched(); | 268 | rcu_read_unlock_sched(); |
269 | } | ||
270 | |||
271 | /** | ||
272 | * percpu_ref_put - decrement a percpu refcount | ||
273 | * @ref: percpu_ref to put | ||
274 | * | ||
275 | * Decrement the refcount, and if 0, call the release function (which was passed | ||
276 | * to percpu_ref_init()) | ||
277 | * | ||
278 | * This function is safe to call as long as @ref is between init and exit. | ||
279 | */ | ||
280 | static inline void percpu_ref_put(struct percpu_ref *ref) | ||
281 | { | ||
282 | percpu_ref_put_many(ref, 1); | ||
254 | } | 283 | } |
255 | 284 | ||
256 | /** | 285 | /** |
257 | * percpu_ref_is_zero - test whether a percpu refcount reached zero | 286 | * percpu_ref_is_zero - test whether a percpu refcount reached zero |
258 | * @ref: percpu_ref to test | 287 | * @ref: percpu_ref to test |
259 | * | 288 | * |
260 | * Returns %true if @ref reached zero. | 289 | * Returns %true if @ref reached zero. |
261 | * | 290 | * |
262 | * This function is safe to call as long as @ref is between init and exit. | 291 | * This function is safe to call as long as @ref is between init and exit. |
263 | */ | 292 | */ |
264 | static inline bool percpu_ref_is_zero(struct percpu_ref *ref) | 293 | static inline bool percpu_ref_is_zero(struct percpu_ref *ref) |
265 | { | 294 | { |
266 | unsigned long __percpu *percpu_count; | 295 | unsigned long __percpu *percpu_count; |
267 | 296 | ||
268 | if (__ref_is_percpu(ref, &percpu_count)) | 297 | if (__ref_is_percpu(ref, &percpu_count)) |
269 | return false; | 298 | return false; |
270 | return !atomic_long_read(&ref->count); | 299 | return !atomic_long_read(&ref->count); |
271 | } | 300 | } |
272 | 301 | ||
273 | #endif | 302 | #endif |
274 | 303 |
mm/memcontrol.c
1 | /* memcontrol.c - Memory Controller | 1 | /* memcontrol.c - Memory Controller |
2 | * | 2 | * |
3 | * Copyright IBM Corporation, 2007 | 3 | * Copyright IBM Corporation, 2007 |
4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> | 4 | * Author Balbir Singh <balbir@linux.vnet.ibm.com> |
5 | * | 5 | * |
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | 9 | * Memory thresholds |
10 | * Copyright (C) 2009 Nokia Corporation | 10 | * Copyright (C) 2009 Nokia Corporation |
11 | * Author: Kirill A. Shutemov | 11 | * Author: Kirill A. Shutemov |
12 | * | 12 | * |
13 | * Kernel Memory Controller | 13 | * Kernel Memory Controller |
14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | 14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. |
15 | * Authors: Glauber Costa and Suleiman Souhlal | 15 | * Authors: Glauber Costa and Suleiman Souhlal |
16 | * | 16 | * |
17 | * This program is free software; you can redistribute it and/or modify | 17 | * This program is free software; you can redistribute it and/or modify |
18 | * it under the terms of the GNU General Public License as published by | 18 | * it under the terms of the GNU General Public License as published by |
19 | * the Free Software Foundation; either version 2 of the License, or | 19 | * the Free Software Foundation; either version 2 of the License, or |
20 | * (at your option) any later version. | 20 | * (at your option) any later version. |
21 | * | 21 | * |
22 | * This program is distributed in the hope that it will be useful, | 22 | * This program is distributed in the hope that it will be useful, |
23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
25 | * GNU General Public License for more details. | 25 | * GNU General Public License for more details. |
26 | */ | 26 | */ |
27 | 27 | ||
28 | #include <linux/page_counter.h> | 28 | #include <linux/page_counter.h> |
29 | #include <linux/memcontrol.h> | 29 | #include <linux/memcontrol.h> |
30 | #include <linux/cgroup.h> | 30 | #include <linux/cgroup.h> |
31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
32 | #include <linux/hugetlb.h> | 32 | #include <linux/hugetlb.h> |
33 | #include <linux/pagemap.h> | 33 | #include <linux/pagemap.h> |
34 | #include <linux/smp.h> | 34 | #include <linux/smp.h> |
35 | #include <linux/page-flags.h> | 35 | #include <linux/page-flags.h> |
36 | #include <linux/backing-dev.h> | 36 | #include <linux/backing-dev.h> |
37 | #include <linux/bit_spinlock.h> | 37 | #include <linux/bit_spinlock.h> |
38 | #include <linux/rcupdate.h> | 38 | #include <linux/rcupdate.h> |
39 | #include <linux/limits.h> | 39 | #include <linux/limits.h> |
40 | #include <linux/export.h> | 40 | #include <linux/export.h> |
41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
42 | #include <linux/rbtree.h> | 42 | #include <linux/rbtree.h> |
43 | #include <linux/slab.h> | 43 | #include <linux/slab.h> |
44 | #include <linux/swap.h> | 44 | #include <linux/swap.h> |
45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
46 | #include <linux/spinlock.h> | 46 | #include <linux/spinlock.h> |
47 | #include <linux/eventfd.h> | 47 | #include <linux/eventfd.h> |
48 | #include <linux/poll.h> | 48 | #include <linux/poll.h> |
49 | #include <linux/sort.h> | 49 | #include <linux/sort.h> |
50 | #include <linux/fs.h> | 50 | #include <linux/fs.h> |
51 | #include <linux/seq_file.h> | 51 | #include <linux/seq_file.h> |
52 | #include <linux/vmpressure.h> | 52 | #include <linux/vmpressure.h> |
53 | #include <linux/mm_inline.h> | 53 | #include <linux/mm_inline.h> |
54 | #include <linux/page_cgroup.h> | 54 | #include <linux/page_cgroup.h> |
55 | #include <linux/cpu.h> | 55 | #include <linux/cpu.h> |
56 | #include <linux/oom.h> | 56 | #include <linux/oom.h> |
57 | #include <linux/lockdep.h> | 57 | #include <linux/lockdep.h> |
58 | #include <linux/file.h> | 58 | #include <linux/file.h> |
59 | #include "internal.h" | 59 | #include "internal.h" |
60 | #include <net/sock.h> | 60 | #include <net/sock.h> |
61 | #include <net/ip.h> | 61 | #include <net/ip.h> |
62 | #include <net/tcp_memcontrol.h> | 62 | #include <net/tcp_memcontrol.h> |
63 | #include "slab.h" | 63 | #include "slab.h" |
64 | 64 | ||
65 | #include <asm/uaccess.h> | 65 | #include <asm/uaccess.h> |
66 | 66 | ||
67 | #include <trace/events/vmscan.h> | 67 | #include <trace/events/vmscan.h> |
68 | 68 | ||
69 | struct cgroup_subsys memory_cgrp_subsys __read_mostly; | 69 | struct cgroup_subsys memory_cgrp_subsys __read_mostly; |
70 | EXPORT_SYMBOL(memory_cgrp_subsys); | 70 | EXPORT_SYMBOL(memory_cgrp_subsys); |
71 | 71 | ||
72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 72 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 73 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
74 | 74 | ||
75 | #ifdef CONFIG_MEMCG_SWAP | 75 | #ifdef CONFIG_MEMCG_SWAP |
76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 76 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
77 | int do_swap_account __read_mostly; | 77 | int do_swap_account __read_mostly; |
78 | 78 | ||
79 | /* for remembering the boot option */ | 79 | /* for remembering the boot option */ |
80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED | 80 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
81 | static int really_do_swap_account __initdata = 1; | 81 | static int really_do_swap_account __initdata = 1; |
82 | #else | 82 | #else |
83 | static int really_do_swap_account __initdata; | 83 | static int really_do_swap_account __initdata; |
84 | #endif | 84 | #endif |
85 | 85 | ||
86 | #else | 86 | #else |
87 | #define do_swap_account 0 | 87 | #define do_swap_account 0 |
88 | #endif | 88 | #endif |
89 | 89 | ||
90 | 90 | ||
91 | static const char * const mem_cgroup_stat_names[] = { | 91 | static const char * const mem_cgroup_stat_names[] = { |
92 | "cache", | 92 | "cache", |
93 | "rss", | 93 | "rss", |
94 | "rss_huge", | 94 | "rss_huge", |
95 | "mapped_file", | 95 | "mapped_file", |
96 | "writeback", | 96 | "writeback", |
97 | "swap", | 97 | "swap", |
98 | }; | 98 | }; |
99 | 99 | ||
100 | enum mem_cgroup_events_index { | 100 | enum mem_cgroup_events_index { |
101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ | 101 | MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ |
102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ | 102 | MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ |
103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ | 103 | MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ |
104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ | 104 | MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ |
105 | MEM_CGROUP_EVENTS_NSTATS, | 105 | MEM_CGROUP_EVENTS_NSTATS, |
106 | }; | 106 | }; |
107 | 107 | ||
108 | static const char * const mem_cgroup_events_names[] = { | 108 | static const char * const mem_cgroup_events_names[] = { |
109 | "pgpgin", | 109 | "pgpgin", |
110 | "pgpgout", | 110 | "pgpgout", |
111 | "pgfault", | 111 | "pgfault", |
112 | "pgmajfault", | 112 | "pgmajfault", |
113 | }; | 113 | }; |
114 | 114 | ||
115 | static const char * const mem_cgroup_lru_names[] = { | 115 | static const char * const mem_cgroup_lru_names[] = { |
116 | "inactive_anon", | 116 | "inactive_anon", |
117 | "active_anon", | 117 | "active_anon", |
118 | "inactive_file", | 118 | "inactive_file", |
119 | "active_file", | 119 | "active_file", |
120 | "unevictable", | 120 | "unevictable", |
121 | }; | 121 | }; |
122 | 122 | ||
123 | /* | 123 | /* |
124 | * Per memcg event counter is incremented at every pagein/pageout. With THP, | 124 | * Per memcg event counter is incremented at every pagein/pageout. With THP, |
125 | * it will be incremented by the number of pages. This counter is used | 125 | * it will be incremented by the number of pages. This counter is used |
126 | * to trigger some periodic events. This is straightforward and better | 126 | * to trigger some periodic events. This is straightforward and better |
127 | * than using jiffies etc. to handle periodic memcg events. | 127 | * than using jiffies etc. to handle periodic memcg events. |
128 | */ | 128 | */ |
129 | enum mem_cgroup_events_target { | 129 | enum mem_cgroup_events_target { |
130 | MEM_CGROUP_TARGET_THRESH, | 130 | MEM_CGROUP_TARGET_THRESH, |
131 | MEM_CGROUP_TARGET_SOFTLIMIT, | 131 | MEM_CGROUP_TARGET_SOFTLIMIT, |
132 | MEM_CGROUP_TARGET_NUMAINFO, | 132 | MEM_CGROUP_TARGET_NUMAINFO, |
133 | MEM_CGROUP_NTARGETS, | 133 | MEM_CGROUP_NTARGETS, |
134 | }; | 134 | }; |
135 | #define THRESHOLDS_EVENTS_TARGET 128 | 135 | #define THRESHOLDS_EVENTS_TARGET 128 |
136 | #define SOFTLIMIT_EVENTS_TARGET 1024 | 136 | #define SOFTLIMIT_EVENTS_TARGET 1024 |
137 | #define NUMAINFO_EVENTS_TARGET 1024 | 137 | #define NUMAINFO_EVENTS_TARGET 1024 |
138 | 138 | ||
139 | struct mem_cgroup_stat_cpu { | 139 | struct mem_cgroup_stat_cpu { |
140 | long count[MEM_CGROUP_STAT_NSTATS]; | 140 | long count[MEM_CGROUP_STAT_NSTATS]; |
141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; | 141 | unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; |
142 | unsigned long nr_page_events; | 142 | unsigned long nr_page_events; |
143 | unsigned long targets[MEM_CGROUP_NTARGETS]; | 143 | unsigned long targets[MEM_CGROUP_NTARGETS]; |
144 | }; | 144 | }; |
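A rough sketch of the event-target scheme described in the comment above (illustrative helper; the real rate-limiting code appears further down in this file): whenever the per-cpu page-event count passes its target, the target is pushed forward and the periodic check runs.

/* Illustrative: fire roughly every THRESHOLDS_EVENTS_TARGET page events. */
static bool my_threshold_event_due(struct mem_cgroup_stat_cpu *stat)
{
	if (unlikely(stat->nr_page_events >=
		     stat->targets[MEM_CGROUP_TARGET_THRESH])) {
		stat->targets[MEM_CGROUP_TARGET_THRESH] =
			stat->nr_page_events + THRESHOLDS_EVENTS_TARGET;
		return true;	/* time to check the thresholds */
	}
	return false;
}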
145 | 145 | ||
146 | struct reclaim_iter { | 146 | struct reclaim_iter { |
147 | struct mem_cgroup *position; | 147 | struct mem_cgroup *position; |
148 | /* scan generation, increased every round-trip */ | 148 | /* scan generation, increased every round-trip */ |
149 | unsigned int generation; | 149 | unsigned int generation; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* | 152 | /* |
153 | * per-zone information in memory controller. | 153 | * per-zone information in memory controller. |
154 | */ | 154 | */ |
155 | struct mem_cgroup_per_zone { | 155 | struct mem_cgroup_per_zone { |
156 | struct lruvec lruvec; | 156 | struct lruvec lruvec; |
157 | unsigned long lru_size[NR_LRU_LISTS]; | 157 | unsigned long lru_size[NR_LRU_LISTS]; |
158 | 158 | ||
159 | struct reclaim_iter iter[DEF_PRIORITY + 1]; | 159 | struct reclaim_iter iter[DEF_PRIORITY + 1]; |
160 | 160 | ||
161 | struct rb_node tree_node; /* RB tree node */ | 161 | struct rb_node tree_node; /* RB tree node */ |
162 | unsigned long usage_in_excess;/* Set to the value by which */ | 162 | unsigned long usage_in_excess;/* Set to the value by which */ |
163 | /* the soft limit is exceeded*/ | 163 | /* the soft limit is exceeded*/ |
164 | bool on_tree; | 164 | bool on_tree; |
165 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ | 165 | struct mem_cgroup *memcg; /* Back pointer, we cannot */ |
166 | /* use container_of */ | 166 | /* use container_of */ |
167 | }; | 167 | }; |
168 | 168 | ||
169 | struct mem_cgroup_per_node { | 169 | struct mem_cgroup_per_node { |
170 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; | 170 | struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; |
171 | }; | 171 | }; |
172 | 172 | ||
173 | /* | 173 | /* |
174 | * Cgroups above their limits are maintained in a RB-Tree, independent of | 174 | * Cgroups above their limits are maintained in a RB-Tree, independent of |
175 | * their hierarchy representation | 175 | * their hierarchy representation |
176 | */ | 176 | */ |
177 | 177 | ||
178 | struct mem_cgroup_tree_per_zone { | 178 | struct mem_cgroup_tree_per_zone { |
179 | struct rb_root rb_root; | 179 | struct rb_root rb_root; |
180 | spinlock_t lock; | 180 | spinlock_t lock; |
181 | }; | 181 | }; |
182 | 182 | ||
183 | struct mem_cgroup_tree_per_node { | 183 | struct mem_cgroup_tree_per_node { |
184 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | 184 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; |
185 | }; | 185 | }; |
186 | 186 | ||
187 | struct mem_cgroup_tree { | 187 | struct mem_cgroup_tree { |
188 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | 188 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; |
189 | }; | 189 | }; |
190 | 190 | ||
191 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 191 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
192 | 192 | ||
193 | struct mem_cgroup_threshold { | 193 | struct mem_cgroup_threshold { |
194 | struct eventfd_ctx *eventfd; | 194 | struct eventfd_ctx *eventfd; |
195 | unsigned long threshold; | 195 | unsigned long threshold; |
196 | }; | 196 | }; |
197 | 197 | ||
198 | /* For threshold */ | 198 | /* For threshold */ |
199 | struct mem_cgroup_threshold_ary { | 199 | struct mem_cgroup_threshold_ary { |
200 | /* An array index points to threshold just below or equal to usage. */ | 200 | /* An array index points to threshold just below or equal to usage. */ |
201 | int current_threshold; | 201 | int current_threshold; |
202 | /* Size of entries[] */ | 202 | /* Size of entries[] */ |
203 | unsigned int size; | 203 | unsigned int size; |
204 | /* Array of thresholds */ | 204 | /* Array of thresholds */ |
205 | struct mem_cgroup_threshold entries[0]; | 205 | struct mem_cgroup_threshold entries[0]; |
206 | }; | 206 | }; |
207 | 207 | ||
208 | struct mem_cgroup_thresholds { | 208 | struct mem_cgroup_thresholds { |
209 | /* Primary thresholds array */ | 209 | /* Primary thresholds array */ |
210 | struct mem_cgroup_threshold_ary *primary; | 210 | struct mem_cgroup_threshold_ary *primary; |
211 | /* | 211 | /* |
212 | * Spare threshold array. | 212 | * Spare threshold array. |
213 | * This is needed to make mem_cgroup_unregister_event() "never fail". | 213 | * This is needed to make mem_cgroup_unregister_event() "never fail". |
214 | * It must be able to store at least primary->size - 1 entries. | 214 | * It must be able to store at least primary->size - 1 entries. |
215 | */ | 215 | */ |
216 | struct mem_cgroup_threshold_ary *spare; | 216 | struct mem_cgroup_threshold_ary *spare; |
217 | }; | 217 | }; |
218 | 218 | ||
219 | /* for OOM */ | 219 | /* for OOM */ |
220 | struct mem_cgroup_eventfd_list { | 220 | struct mem_cgroup_eventfd_list { |
221 | struct list_head list; | 221 | struct list_head list; |
222 | struct eventfd_ctx *eventfd; | 222 | struct eventfd_ctx *eventfd; |
223 | }; | 223 | }; |
224 | 224 | ||
225 | /* | 225 | /* |
226 | * cgroup_event represents events which userspace wants to receive. | 226 | * cgroup_event represents events which userspace wants to receive. |
227 | */ | 227 | */ |
228 | struct mem_cgroup_event { | 228 | struct mem_cgroup_event { |
229 | /* | 229 | /* |
230 | * memcg which the event belongs to. | 230 | * memcg which the event belongs to. |
231 | */ | 231 | */ |
232 | struct mem_cgroup *memcg; | 232 | struct mem_cgroup *memcg; |
233 | /* | 233 | /* |
234 | * eventfd to signal userspace about the event. | 234 | * eventfd to signal userspace about the event. |
235 | */ | 235 | */ |
236 | struct eventfd_ctx *eventfd; | 236 | struct eventfd_ctx *eventfd; |
237 | /* | 237 | /* |
238 | * Each of these stored in a list by the cgroup. | 238 | * Each of these stored in a list by the cgroup. |
239 | */ | 239 | */ |
240 | struct list_head list; | 240 | struct list_head list; |
241 | /* | 241 | /* |
242 | * register_event() callback will be used to add a new userspace | 242 | * register_event() callback will be used to add a new userspace |
243 | * waiter for changes related to this event. Use eventfd_signal() | 243 | * waiter for changes related to this event. Use eventfd_signal() |
244 | * on eventfd to send notification to userspace. | 244 | * on eventfd to send notification to userspace. |
245 | */ | 245 | */ |
246 | int (*register_event)(struct mem_cgroup *memcg, | 246 | int (*register_event)(struct mem_cgroup *memcg, |
247 | struct eventfd_ctx *eventfd, const char *args); | 247 | struct eventfd_ctx *eventfd, const char *args); |
248 | /* | 248 | /* |
249 | * unregister_event() callback will be called when userspace closes | 249 | * unregister_event() callback will be called when userspace closes |
250 | * the eventfd or on cgroup removal. This callback must be set | 250 | * the eventfd or on cgroup removal. This callback must be set |
251 | * if you want to provide notification functionality. | 251 | * if you want to provide notification functionality. |
252 | */ | 252 | */ |
253 | void (*unregister_event)(struct mem_cgroup *memcg, | 253 | void (*unregister_event)(struct mem_cgroup *memcg, |
254 | struct eventfd_ctx *eventfd); | 254 | struct eventfd_ctx *eventfd); |
255 | /* | 255 | /* |
256 | * All fields below needed to unregister event when | 256 | * All fields below needed to unregister event when |
257 | * userspace closes eventfd. | 257 | * userspace closes eventfd. |
258 | */ | 258 | */ |
259 | poll_table pt; | 259 | poll_table pt; |
260 | wait_queue_head_t *wqh; | 260 | wait_queue_head_t *wqh; |
261 | wait_queue_t wait; | 261 | wait_queue_t wait; |
262 | struct work_struct remove; | 262 | struct work_struct remove; |
263 | }; | 263 | }; |
264 | 264 | ||
265 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); | 265 | static void mem_cgroup_threshold(struct mem_cgroup *memcg); |
266 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | 266 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); |
267 | 267 | ||
268 | /* | 268 | /* |
269 | * The memory controller data structure. The memory controller controls both | 269 | * The memory controller data structure. The memory controller controls both |
270 | * page cache and RSS per cgroup. We would eventually like to provide | 270 | * page cache and RSS per cgroup. We would eventually like to provide |
271 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 271 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
272 | * to help the administrator determine what knobs to tune. | 272 | * to help the administrator determine what knobs to tune. |
273 | * | 273 | * |
274 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 274 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
275 | * we hit the water mark. Maybe even add a low water mark, such that | 275 | * we hit the water mark. Maybe even add a low water mark, such that |
276 | * no reclaim occurs from a cgroup at its low water mark; this is | 276 | * no reclaim occurs from a cgroup at its low water mark; this is |
277 | * a feature that will be implemented much later in the future. | 277 | * a feature that will be implemented much later in the future. |
278 | */ | 278 | */ |
279 | struct mem_cgroup { | 279 | struct mem_cgroup { |
280 | struct cgroup_subsys_state css; | 280 | struct cgroup_subsys_state css; |
281 | 281 | ||
282 | /* Accounted resources */ | 282 | /* Accounted resources */ |
283 | struct page_counter memory; | 283 | struct page_counter memory; |
284 | struct page_counter memsw; | 284 | struct page_counter memsw; |
285 | struct page_counter kmem; | 285 | struct page_counter kmem; |
286 | 286 | ||
287 | unsigned long soft_limit; | 287 | unsigned long soft_limit; |
288 | 288 | ||
289 | /* vmpressure notifications */ | 289 | /* vmpressure notifications */ |
290 | struct vmpressure vmpressure; | 290 | struct vmpressure vmpressure; |
291 | 291 | ||
292 | /* css_online() has been completed */ | 292 | /* css_online() has been completed */ |
293 | int initialized; | 293 | int initialized; |
294 | 294 | ||
295 | /* | 295 | /* |
296 | * Should the accounting and control be hierarchical, per subtree? | 296 | * Should the accounting and control be hierarchical, per subtree? |
297 | */ | 297 | */ |
298 | bool use_hierarchy; | 298 | bool use_hierarchy; |
299 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | 299 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ |
300 | 300 | ||
301 | bool oom_lock; | 301 | bool oom_lock; |
302 | atomic_t under_oom; | 302 | atomic_t under_oom; |
303 | atomic_t oom_wakeups; | 303 | atomic_t oom_wakeups; |
304 | 304 | ||
305 | int swappiness; | 305 | int swappiness; |
306 | /* OOM-Killer disable */ | 306 | /* OOM-Killer disable */ |
307 | int oom_kill_disable; | 307 | int oom_kill_disable; |
308 | 308 | ||
309 | /* protect arrays of thresholds */ | 309 | /* protect arrays of thresholds */ |
310 | struct mutex thresholds_lock; | 310 | struct mutex thresholds_lock; |
311 | 311 | ||
312 | /* thresholds for memory usage. RCU-protected */ | 312 | /* thresholds for memory usage. RCU-protected */ |
313 | struct mem_cgroup_thresholds thresholds; | 313 | struct mem_cgroup_thresholds thresholds; |
314 | 314 | ||
315 | /* thresholds for mem+swap usage. RCU-protected */ | 315 | /* thresholds for mem+swap usage. RCU-protected */ |
316 | struct mem_cgroup_thresholds memsw_thresholds; | 316 | struct mem_cgroup_thresholds memsw_thresholds; |
317 | 317 | ||
318 | /* For oom notifier event fd */ | 318 | /* For oom notifier event fd */ |
319 | struct list_head oom_notify; | 319 | struct list_head oom_notify; |
320 | 320 | ||
321 | /* | 321 | /* |
322 | * Should we move charges of a task when a task is moved into this | 322 | * Should we move charges of a task when a task is moved into this |
323 | * mem_cgroup ? And what type of charges should we move ? | 323 | * mem_cgroup ? And what type of charges should we move ? |
324 | */ | 324 | */ |
325 | unsigned long move_charge_at_immigrate; | 325 | unsigned long move_charge_at_immigrate; |
326 | /* | 326 | /* |
327 | * set > 0 if pages under this cgroup are moving to other cgroup. | 327 | * set > 0 if pages under this cgroup are moving to other cgroup. |
328 | */ | 328 | */ |
329 | atomic_t moving_account; | 329 | atomic_t moving_account; |
330 | /* taken only while moving_account > 0 */ | 330 | /* taken only while moving_account > 0 */ |
331 | spinlock_t move_lock; | 331 | spinlock_t move_lock; |
332 | /* | 332 | /* |
333 | * percpu counter. | 333 | * percpu counter. |
334 | */ | 334 | */ |
335 | struct mem_cgroup_stat_cpu __percpu *stat; | 335 | struct mem_cgroup_stat_cpu __percpu *stat; |
336 | /* | 336 | /* |
337 | * used when a cpu is offlined and for other synchronization. | 337 | * used when a cpu is offlined and for other synchronization. |
338 | * See mem_cgroup_read_stat(). | 338 | * See mem_cgroup_read_stat(). |
339 | */ | 339 | */ |
340 | struct mem_cgroup_stat_cpu nocpu_base; | 340 | struct mem_cgroup_stat_cpu nocpu_base; |
341 | spinlock_t pcp_counter_lock; | 341 | spinlock_t pcp_counter_lock; |
342 | 342 | ||
343 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 343 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
344 | struct cg_proto tcp_mem; | 344 | struct cg_proto tcp_mem; |
345 | #endif | 345 | #endif |
346 | #if defined(CONFIG_MEMCG_KMEM) | 346 | #if defined(CONFIG_MEMCG_KMEM) |
347 | /* analogous to slab_common's slab_caches list, but per-memcg; | 347 | /* analogous to slab_common's slab_caches list, but per-memcg; |
348 | * protected by memcg_slab_mutex */ | 348 | * protected by memcg_slab_mutex */ |
349 | struct list_head memcg_slab_caches; | 349 | struct list_head memcg_slab_caches; |
350 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | 350 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ |
351 | int kmemcg_id; | 351 | int kmemcg_id; |
352 | #endif | 352 | #endif |
353 | 353 | ||
354 | int last_scanned_node; | 354 | int last_scanned_node; |
355 | #if MAX_NUMNODES > 1 | 355 | #if MAX_NUMNODES > 1 |
356 | nodemask_t scan_nodes; | 356 | nodemask_t scan_nodes; |
357 | atomic_t numainfo_events; | 357 | atomic_t numainfo_events; |
358 | atomic_t numainfo_updating; | 358 | atomic_t numainfo_updating; |
359 | #endif | 359 | #endif |
360 | 360 | ||
361 | /* List of events which userspace wants to receive */ | 361 | /* List of events which userspace wants to receive */ |
362 | struct list_head event_list; | 362 | struct list_head event_list; |
363 | spinlock_t event_list_lock; | 363 | spinlock_t event_list_lock; |
364 | 364 | ||
365 | struct mem_cgroup_per_node *nodeinfo[0]; | 365 | struct mem_cgroup_per_node *nodeinfo[0]; |
366 | /* WARNING: nodeinfo must be the last member here */ | 366 | /* WARNING: nodeinfo must be the last member here */ |
367 | }; | 367 | }; |
368 | 368 | ||
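Editor's note: the trailing nodeinfo[0] member above is a flexible array, so struct mem_cgroup cannot be allocated with a plain sizeof(); the allocation has to append one per-node pointer slot for every possible node, which is also why the warning insists it stay the last member. A minimal sketch of that sizing, using the hypothetical helper name memcg_struct_size(), which is not part of this diff:

    /* Sketch only: size a struct that ends in a flexible nodeinfo[0] array. */
    static size_t memcg_struct_size(void)
    {
            /* base struct plus one nodeinfo pointer slot per possible node */
            return sizeof(struct mem_cgroup) +
                   nr_node_ids * sizeof(struct mem_cgroup_per_node *);
    }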
369 | /* internal only representation about the status of kmem accounting. */ | 369 | /* internal only representation about the status of kmem accounting. */ |
370 | enum { | 370 | enum { |
371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ | 371 | KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ |
372 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | 372 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ |
373 | }; | 373 | }; |
374 | 374 | ||
375 | #ifdef CONFIG_MEMCG_KMEM | 375 | #ifdef CONFIG_MEMCG_KMEM |
376 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | 376 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) |
377 | { | 377 | { |
378 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 378 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
379 | } | 379 | } |
380 | 380 | ||
381 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | 381 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) |
382 | { | 382 | { |
383 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | 383 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); |
384 | } | 384 | } |
385 | 385 | ||
386 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | 386 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) |
387 | { | 387 | { |
388 | /* | 388 | /* |
389 | * Our caller must use css_get() first, because memcg_uncharge_kmem() | 389 | * Our caller must use css_get() first, because memcg_uncharge_kmem() |
390 | * will call css_put() if it sees the memcg is dead. | 390 | * will call css_put() if it sees the memcg is dead. |
391 | */ | 391 | */ |
392 | smp_wmb(); | 392 | smp_wmb(); |
393 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | 393 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) |
394 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | 394 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); |
395 | } | 395 | } |
396 | 396 | ||
397 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | 397 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) |
398 | { | 398 | { |
399 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | 399 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, |
400 | &memcg->kmem_account_flags); | 400 | &memcg->kmem_account_flags); |
401 | } | 401 | } |
402 | #endif | 402 | #endif |
403 | 403 | ||
404 | /* Stuff for moving charges at task migration. */ | 404 | /* Stuff for moving charges at task migration. */ |
405 | /* | 405 | /* |
406 | * Types of charges to be moved. "move_charge_at_immigrate" and | 406 | * Types of charges to be moved. "move_charge_at_immigrate" and |
407 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. | 407 | * "immigrate_flags" are treated as a left-shifted bitmap of these types. |
408 | */ | 408 | */ |
409 | enum move_type { | 409 | enum move_type { |
410 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 410 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
411 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | 411 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ |
412 | NR_MOVE_TYPE, | 412 | NR_MOVE_TYPE, |
413 | }; | 413 | }; |
414 | 414 | ||
415 | /* "mc" and its members are protected by cgroup_mutex */ | 415 | /* "mc" and its members are protected by cgroup_mutex */ |
416 | static struct move_charge_struct { | 416 | static struct move_charge_struct { |
417 | spinlock_t lock; /* for from, to */ | 417 | spinlock_t lock; /* for from, to */ |
418 | struct mem_cgroup *from; | 418 | struct mem_cgroup *from; |
419 | struct mem_cgroup *to; | 419 | struct mem_cgroup *to; |
420 | unsigned long immigrate_flags; | 420 | unsigned long immigrate_flags; |
421 | unsigned long precharge; | 421 | unsigned long precharge; |
422 | unsigned long moved_charge; | 422 | unsigned long moved_charge; |
423 | unsigned long moved_swap; | 423 | unsigned long moved_swap; |
424 | struct task_struct *moving_task; /* a task moving charges */ | 424 | struct task_struct *moving_task; /* a task moving charges */ |
425 | wait_queue_head_t waitq; /* a waitq for other context */ | 425 | wait_queue_head_t waitq; /* a waitq for other context */ |
426 | } mc = { | 426 | } mc = { |
427 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), | 427 | .lock = __SPIN_LOCK_UNLOCKED(mc.lock), |
428 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 428 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
429 | }; | 429 | }; |
430 | 430 | ||
431 | static bool move_anon(void) | 431 | static bool move_anon(void) |
432 | { | 432 | { |
433 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); | 433 | return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); |
434 | } | 434 | } |
435 | 435 | ||
436 | static bool move_file(void) | 436 | static bool move_file(void) |
437 | { | 437 | { |
438 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); | 438 | return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); |
439 | } | 439 | } |
440 | 440 | ||
441 | /* | 441 | /* |
442 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 442 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
443 | * limit reclaim to prevent infinite loops, if they ever occur. | 443 | * limit reclaim to prevent infinite loops, if they ever occur. |
444 | */ | 444 | */ |
445 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 | 445 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 |
446 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 | 446 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 |
447 | 447 | ||
448 | enum charge_type { | 448 | enum charge_type { |
449 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 449 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
450 | MEM_CGROUP_CHARGE_TYPE_ANON, | 450 | MEM_CGROUP_CHARGE_TYPE_ANON, |
451 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 451 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
452 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 452 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
453 | NR_CHARGE_TYPE, | 453 | NR_CHARGE_TYPE, |
454 | }; | 454 | }; |
455 | 455 | ||
456 | /* for encoding cft->private value on file */ | 456 | /* for encoding cft->private value on file */ |
457 | enum res_type { | 457 | enum res_type { |
458 | _MEM, | 458 | _MEM, |
459 | _MEMSWAP, | 459 | _MEMSWAP, |
460 | _OOM_TYPE, | 460 | _OOM_TYPE, |
461 | _KMEM, | 461 | _KMEM, |
462 | }; | 462 | }; |
463 | 463 | ||
464 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) | 464 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
465 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) | 465 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
466 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 466 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
467 | /* Used for OOM notifier */ | 467 | /* Used for OOM notifier */ |
468 | #define OOM_CONTROL (0) | 468 | #define OOM_CONTROL (0) |
469 | 469 | ||
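Editor's note: the MEMFILE_* macros above pack a res_type into the high 16 bits of cft->private and an attribute into the low 16 bits, so a single control-file table entry can name both. A small illustration; RES_LIMIT below stands in for whatever attribute constant the caller uses and is only an assumed example, not taken from this hunk:

    /* Sketch only: pack a (type, attribute) pair into cft->private, then unpack it. */
    int priv = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);   /* RES_LIMIT is assumed here */

    BUG_ON(MEMFILE_TYPE(priv) != _KMEM);            /* high 16 bits: res_type  */
    BUG_ON(MEMFILE_ATTR(priv) != RES_LIMIT);        /* low 16 bits: attribute  */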
470 | /* | 470 | /* |
471 | * The memcg_create_mutex will be held whenever a new cgroup is created. | 471 | * The memcg_create_mutex will be held whenever a new cgroup is created. |
472 | * As a consequence, any change that needs to protect against new child cgroups | 472 | * As a consequence, any change that needs to protect against new child cgroups |
473 | * appearing has to hold it as well. | 473 | * appearing has to hold it as well. |
474 | */ | 474 | */ |
475 | static DEFINE_MUTEX(memcg_create_mutex); | 475 | static DEFINE_MUTEX(memcg_create_mutex); |
476 | 476 | ||
477 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | 477 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) |
478 | { | 478 | { |
479 | return s ? container_of(s, struct mem_cgroup, css) : NULL; | 479 | return s ? container_of(s, struct mem_cgroup, css) : NULL; |
480 | } | 480 | } |
481 | 481 | ||
482 | /* Some nice accessors for the vmpressure. */ | 482 | /* Some nice accessors for the vmpressure. */ |
483 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) | 483 | struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) |
484 | { | 484 | { |
485 | if (!memcg) | 485 | if (!memcg) |
486 | memcg = root_mem_cgroup; | 486 | memcg = root_mem_cgroup; |
487 | return &memcg->vmpressure; | 487 | return &memcg->vmpressure; |
488 | } | 488 | } |
489 | 489 | ||
490 | struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) | 490 | struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) |
491 | { | 491 | { |
492 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; | 492 | return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; |
493 | } | 493 | } |
494 | 494 | ||
495 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) | 495 | static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) |
496 | { | 496 | { |
497 | return (memcg == root_mem_cgroup); | 497 | return (memcg == root_mem_cgroup); |
498 | } | 498 | } |
499 | 499 | ||
500 | /* | 500 | /* |
501 | * We restrict the id to the range [1, 65535], so it can fit into | 501 | * We restrict the id to the range [1, 65535], so it can fit into |
502 | * an unsigned short. | 502 | * an unsigned short. |
503 | */ | 503 | */ |
504 | #define MEM_CGROUP_ID_MAX USHRT_MAX | 504 | #define MEM_CGROUP_ID_MAX USHRT_MAX |
505 | 505 | ||
506 | static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | 506 | static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) |
507 | { | 507 | { |
508 | return memcg->css.id; | 508 | return memcg->css.id; |
509 | } | 509 | } |
510 | 510 | ||
511 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | 511 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) |
512 | { | 512 | { |
513 | struct cgroup_subsys_state *css; | 513 | struct cgroup_subsys_state *css; |
514 | 514 | ||
515 | css = css_from_id(id, &memory_cgrp_subsys); | 515 | css = css_from_id(id, &memory_cgrp_subsys); |
516 | return mem_cgroup_from_css(css); | 516 | return mem_cgroup_from_css(css); |
517 | } | 517 | } |
518 | 518 | ||
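Editor's note: mem_cgroup_id() and mem_cgroup_from_id() exist so that callers can stash a memcg in only 16 bits (the [1, 65535] restriction above guarantees it fits an unsigned short) and resolve it back later. A minimal sketch of that round trip, assuming the caller re-validates the css with css_tryget_online() because the group may have gone offline in the meantime:

    /* Sketch only: store a memcg as a 16-bit id, then look it up again. */
    unsigned short id = mem_cgroup_id(memcg);       /* fits: id <= MEM_CGROUP_ID_MAX */

    rcu_read_lock();
    memcg = mem_cgroup_from_id(id);                 /* may be NULL if the css is gone */
    if (memcg && css_tryget_online(&memcg->css)) {
            /* the acquired reference keeps memcg alive past rcu_read_unlock() */
    }
    rcu_read_unlock();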
519 | /* Writing them here to avoid exposing memcg's inner layout */ | 519 | /* Writing them here to avoid exposing memcg's inner layout */ |
520 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) | 520 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
521 | 521 | ||
522 | void sock_update_memcg(struct sock *sk) | 522 | void sock_update_memcg(struct sock *sk) |
523 | { | 523 | { |
524 | if (mem_cgroup_sockets_enabled) { | 524 | if (mem_cgroup_sockets_enabled) { |
525 | struct mem_cgroup *memcg; | 525 | struct mem_cgroup *memcg; |
526 | struct cg_proto *cg_proto; | 526 | struct cg_proto *cg_proto; |
527 | 527 | ||
528 | BUG_ON(!sk->sk_prot->proto_cgroup); | 528 | BUG_ON(!sk->sk_prot->proto_cgroup); |
529 | 529 | ||
530 | /* Socket cloning can throw us here with sk_cgrp already | 530 | /* Socket cloning can throw us here with sk_cgrp already |
531 | * filled. It won't, however, necessarily happen from | 531 | * filled. It won't, however, necessarily happen from |
532 | * process context. So the test for root memcg given | 532 | * process context. So the test for root memcg given |
533 | * the current task's memcg won't help us in this case. | 533 | * the current task's memcg won't help us in this case. |
534 | * | 534 | * |
535 | * Respecting the original socket's memcg is a better | 535 | * Respecting the original socket's memcg is a better |
536 | * decision in this case. | 536 | * decision in this case. |
537 | */ | 537 | */ |
538 | if (sk->sk_cgrp) { | 538 | if (sk->sk_cgrp) { |
539 | BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); | 539 | BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); |
540 | css_get(&sk->sk_cgrp->memcg->css); | 540 | css_get(&sk->sk_cgrp->memcg->css); |
541 | return; | 541 | return; |
542 | } | 542 | } |
543 | 543 | ||
544 | rcu_read_lock(); | 544 | rcu_read_lock(); |
545 | memcg = mem_cgroup_from_task(current); | 545 | memcg = mem_cgroup_from_task(current); |
546 | cg_proto = sk->sk_prot->proto_cgroup(memcg); | 546 | cg_proto = sk->sk_prot->proto_cgroup(memcg); |
547 | if (!mem_cgroup_is_root(memcg) && | 547 | if (!mem_cgroup_is_root(memcg) && |
548 | memcg_proto_active(cg_proto) && | 548 | memcg_proto_active(cg_proto) && |
549 | css_tryget_online(&memcg->css)) { | 549 | css_tryget_online(&memcg->css)) { |
550 | sk->sk_cgrp = cg_proto; | 550 | sk->sk_cgrp = cg_proto; |
551 | } | 551 | } |
552 | rcu_read_unlock(); | 552 | rcu_read_unlock(); |
553 | } | 553 | } |
554 | } | 554 | } |
555 | EXPORT_SYMBOL(sock_update_memcg); | 555 | EXPORT_SYMBOL(sock_update_memcg); |
556 | 556 | ||
557 | void sock_release_memcg(struct sock *sk) | 557 | void sock_release_memcg(struct sock *sk) |
558 | { | 558 | { |
559 | if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { | 559 | if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { |
560 | struct mem_cgroup *memcg; | 560 | struct mem_cgroup *memcg; |
561 | WARN_ON(!sk->sk_cgrp->memcg); | 561 | WARN_ON(!sk->sk_cgrp->memcg); |
562 | memcg = sk->sk_cgrp->memcg; | 562 | memcg = sk->sk_cgrp->memcg; |
563 | css_put(&sk->sk_cgrp->memcg->css); | 563 | css_put(&sk->sk_cgrp->memcg->css); |
564 | } | 564 | } |
565 | } | 565 | } |
566 | 566 | ||
567 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | 567 | struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) |
568 | { | 568 | { |
569 | if (!memcg || mem_cgroup_is_root(memcg)) | 569 | if (!memcg || mem_cgroup_is_root(memcg)) |
570 | return NULL; | 570 | return NULL; |
571 | 571 | ||
572 | return &memcg->tcp_mem; | 572 | return &memcg->tcp_mem; |
573 | } | 573 | } |
574 | EXPORT_SYMBOL(tcp_proto_cgroup); | 574 | EXPORT_SYMBOL(tcp_proto_cgroup); |
575 | 575 | ||
576 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 576 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
577 | { | 577 | { |
578 | if (!memcg_proto_activated(&memcg->tcp_mem)) | 578 | if (!memcg_proto_activated(&memcg->tcp_mem)) |
579 | return; | 579 | return; |
580 | static_key_slow_dec(&memcg_socket_limit_enabled); | 580 | static_key_slow_dec(&memcg_socket_limit_enabled); |
581 | } | 581 | } |
582 | #else | 582 | #else |
583 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 583 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
584 | { | 584 | { |
585 | } | 585 | } |
586 | #endif | 586 | #endif |
587 | 587 | ||
588 | #ifdef CONFIG_MEMCG_KMEM | 588 | #ifdef CONFIG_MEMCG_KMEM |
589 | /* | 589 | /* |
590 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | 590 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. |
591 | * The main reason for not using the cgroup id for this is that it | 591 | * The main reason for not using the cgroup id for this is that it |
592 | * works better in sparse environments, where we have a lot of memcgs | 592 | * works better in sparse environments, where we have a lot of memcgs |
593 | * but only a few of them are kmem-limited. Also, if we had, for instance, 200 | 593 | * but only a few of them are kmem-limited. Also, if we had, for instance, 200 |
594 | * memcgs and none but the 200th was kmem-limited, we would need a | 594 | * memcgs and none but the 200th was kmem-limited, we would need a |
595 | * 200-entry array for that. | 595 | * 200-entry array for that. |
596 | * | 596 | * |
597 | * The current size of the caches array is stored in | 597 | * The current size of the caches array is stored in |
598 | * memcg_limited_groups_array_size. It will double each time we have to | 598 | * memcg_limited_groups_array_size. It will double each time we have to |
599 | * increase it. | 599 | * increase it. |
600 | */ | 600 | */ |
601 | static DEFINE_IDA(kmem_limited_groups); | 601 | static DEFINE_IDA(kmem_limited_groups); |
602 | int memcg_limited_groups_array_size; | 602 | int memcg_limited_groups_array_size; |
603 | 603 | ||
604 | /* | 604 | /* |
605 | * MIN_SIZE is different from 1, because we would like to avoid going through | 605 | * MIN_SIZE is different from 1, because we would like to avoid going through |
606 | * the alloc/free process all the time. In a small machine, 4 kmem-limited | 606 | * the alloc/free process all the time. In a small machine, 4 kmem-limited |
607 | * cgroups is a reasonable guess. In the future, it could be a parameter or | 607 | * cgroups is a reasonable guess. In the future, it could be a parameter or |
608 | * tunable, but that is strictly not necessary. | 608 | * tunable, but that is strictly not necessary. |
609 | * | 609 | * |
610 | * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get | 610 | * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get |
611 | * this constant directly from cgroup, but it is understandable that this is | 611 | * this constant directly from cgroup, but it is understandable that this is |
612 | * better kept as an internal representation in cgroup.c. In any case, the | 612 | * better kept as an internal representation in cgroup.c. In any case, the |
613 | * cgrp_id space is not getting any smaller, and we don't have to necessarily | 613 | * cgrp_id space is not getting any smaller, and we don't have to necessarily |
614 | * increase ours as well if it increases. | 614 | * increase ours as well if it increases. |
615 | */ | 615 | */ |
616 | #define MEMCG_CACHES_MIN_SIZE 4 | 616 | #define MEMCG_CACHES_MIN_SIZE 4 |
617 | #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX | 617 | #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX |
618 | 618 | ||
619 | /* | 619 | /* |
620 | * A lot of the calls to the cache allocation functions are expected to be | 620 | * A lot of the calls to the cache allocation functions are expected to be |
621 | * inlined by the compiler. Since the calls to memcg_kmem_get_cache are | 621 | * inlined by the compiler. Since the calls to memcg_kmem_get_cache are |
622 | * conditional on this static branch, we'll have to allow modules that do | 622 | * conditional on this static branch, we'll have to allow modules that do |
623 | * kmem_cache_alloc and the like to see this symbol as well. | 623 | * kmem_cache_alloc and the like to see this symbol as well. |
624 | */ | 624 | */ |
625 | struct static_key memcg_kmem_enabled_key; | 625 | struct static_key memcg_kmem_enabled_key; |
626 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 626 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
627 | 627 | ||
628 | static void memcg_free_cache_id(int id); | 628 | static void memcg_free_cache_id(int id); |
629 | 629 | ||
630 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 630 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
631 | { | 631 | { |
632 | if (memcg_kmem_is_active(memcg)) { | 632 | if (memcg_kmem_is_active(memcg)) { |
633 | static_key_slow_dec(&memcg_kmem_enabled_key); | 633 | static_key_slow_dec(&memcg_kmem_enabled_key); |
634 | memcg_free_cache_id(memcg->kmemcg_id); | 634 | memcg_free_cache_id(memcg->kmemcg_id); |
635 | } | 635 | } |
636 | /* | 636 | /* |
637 | * This check can't live in kmem destruction function, | 637 | * This check can't live in kmem destruction function, |
638 | * since the charges will outlive the cgroup | 638 | * since the charges will outlive the cgroup |
639 | */ | 639 | */ |
640 | WARN_ON(page_counter_read(&memcg->kmem)); | 640 | WARN_ON(page_counter_read(&memcg->kmem)); |
641 | } | 641 | } |
642 | #else | 642 | #else |
643 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 643 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
644 | { | 644 | { |
645 | } | 645 | } |
646 | #endif /* CONFIG_MEMCG_KMEM */ | 646 | #endif /* CONFIG_MEMCG_KMEM */ |
647 | 647 | ||
648 | static void disarm_static_keys(struct mem_cgroup *memcg) | 648 | static void disarm_static_keys(struct mem_cgroup *memcg) |
649 | { | 649 | { |
650 | disarm_sock_keys(memcg); | 650 | disarm_sock_keys(memcg); |
651 | disarm_kmem_keys(memcg); | 651 | disarm_kmem_keys(memcg); |
652 | } | 652 | } |
653 | 653 | ||
654 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 654 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
655 | 655 | ||
656 | static struct mem_cgroup_per_zone * | 656 | static struct mem_cgroup_per_zone * |
657 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) | 657 | mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) |
658 | { | 658 | { |
659 | int nid = zone_to_nid(zone); | 659 | int nid = zone_to_nid(zone); |
660 | int zid = zone_idx(zone); | 660 | int zid = zone_idx(zone); |
661 | 661 | ||
662 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; | 662 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; |
663 | } | 663 | } |
664 | 664 | ||
665 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) | 665 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) |
666 | { | 666 | { |
667 | return &memcg->css; | 667 | return &memcg->css; |
668 | } | 668 | } |
669 | 669 | ||
670 | static struct mem_cgroup_per_zone * | 670 | static struct mem_cgroup_per_zone * |
671 | mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) | 671 | mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) |
672 | { | 672 | { |
673 | int nid = page_to_nid(page); | 673 | int nid = page_to_nid(page); |
674 | int zid = page_zonenum(page); | 674 | int zid = page_zonenum(page); |
675 | 675 | ||
676 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; | 676 | return &memcg->nodeinfo[nid]->zoneinfo[zid]; |
677 | } | 677 | } |
678 | 678 | ||
679 | static struct mem_cgroup_tree_per_zone * | 679 | static struct mem_cgroup_tree_per_zone * |
680 | soft_limit_tree_node_zone(int nid, int zid) | 680 | soft_limit_tree_node_zone(int nid, int zid) |
681 | { | 681 | { |
682 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 682 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; |
683 | } | 683 | } |
684 | 684 | ||
685 | static struct mem_cgroup_tree_per_zone * | 685 | static struct mem_cgroup_tree_per_zone * |
686 | soft_limit_tree_from_page(struct page *page) | 686 | soft_limit_tree_from_page(struct page *page) |
687 | { | 687 | { |
688 | int nid = page_to_nid(page); | 688 | int nid = page_to_nid(page); |
689 | int zid = page_zonenum(page); | 689 | int zid = page_zonenum(page); |
690 | 690 | ||
691 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | 691 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; |
692 | } | 692 | } |
693 | 693 | ||
694 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, | 694 | static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, |
695 | struct mem_cgroup_tree_per_zone *mctz, | 695 | struct mem_cgroup_tree_per_zone *mctz, |
696 | unsigned long new_usage_in_excess) | 696 | unsigned long new_usage_in_excess) |
697 | { | 697 | { |
698 | struct rb_node **p = &mctz->rb_root.rb_node; | 698 | struct rb_node **p = &mctz->rb_root.rb_node; |
699 | struct rb_node *parent = NULL; | 699 | struct rb_node *parent = NULL; |
700 | struct mem_cgroup_per_zone *mz_node; | 700 | struct mem_cgroup_per_zone *mz_node; |
701 | 701 | ||
702 | if (mz->on_tree) | 702 | if (mz->on_tree) |
703 | return; | 703 | return; |
704 | 704 | ||
705 | mz->usage_in_excess = new_usage_in_excess; | 705 | mz->usage_in_excess = new_usage_in_excess; |
706 | if (!mz->usage_in_excess) | 706 | if (!mz->usage_in_excess) |
707 | return; | 707 | return; |
708 | while (*p) { | 708 | while (*p) { |
709 | parent = *p; | 709 | parent = *p; |
710 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | 710 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, |
711 | tree_node); | 711 | tree_node); |
712 | if (mz->usage_in_excess < mz_node->usage_in_excess) | 712 | if (mz->usage_in_excess < mz_node->usage_in_excess) |
713 | p = &(*p)->rb_left; | 713 | p = &(*p)->rb_left; |
714 | /* | 714 | /* |
715 | * We can't avoid mem cgroups that are over their soft | 715 | * We can't avoid mem cgroups that are over their soft |
716 | * limit by the same amount | 716 | * limit by the same amount |
717 | */ | 717 | */ |
718 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | 718 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) |
719 | p = &(*p)->rb_right; | 719 | p = &(*p)->rb_right; |
720 | } | 720 | } |
721 | rb_link_node(&mz->tree_node, parent, p); | 721 | rb_link_node(&mz->tree_node, parent, p); |
722 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | 722 | rb_insert_color(&mz->tree_node, &mctz->rb_root); |
723 | mz->on_tree = true; | 723 | mz->on_tree = true; |
724 | } | 724 | } |
725 | 725 | ||
726 | static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | 726 | static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, |
727 | struct mem_cgroup_tree_per_zone *mctz) | 727 | struct mem_cgroup_tree_per_zone *mctz) |
728 | { | 728 | { |
729 | if (!mz->on_tree) | 729 | if (!mz->on_tree) |
730 | return; | 730 | return; |
731 | rb_erase(&mz->tree_node, &mctz->rb_root); | 731 | rb_erase(&mz->tree_node, &mctz->rb_root); |
732 | mz->on_tree = false; | 732 | mz->on_tree = false; |
733 | } | 733 | } |
734 | 734 | ||
735 | static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | 735 | static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, |
736 | struct mem_cgroup_tree_per_zone *mctz) | 736 | struct mem_cgroup_tree_per_zone *mctz) |
737 | { | 737 | { |
738 | unsigned long flags; | 738 | unsigned long flags; |
739 | 739 | ||
740 | spin_lock_irqsave(&mctz->lock, flags); | 740 | spin_lock_irqsave(&mctz->lock, flags); |
741 | __mem_cgroup_remove_exceeded(mz, mctz); | 741 | __mem_cgroup_remove_exceeded(mz, mctz); |
742 | spin_unlock_irqrestore(&mctz->lock, flags); | 742 | spin_unlock_irqrestore(&mctz->lock, flags); |
743 | } | 743 | } |
744 | 744 | ||
745 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | 745 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
746 | { | 746 | { |
747 | unsigned long nr_pages = page_counter_read(&memcg->memory); | 747 | unsigned long nr_pages = page_counter_read(&memcg->memory); |
748 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | 748 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); |
749 | unsigned long excess = 0; | 749 | unsigned long excess = 0; |
750 | 750 | ||
751 | if (nr_pages > soft_limit) | 751 | if (nr_pages > soft_limit) |
752 | excess = nr_pages - soft_limit; | 752 | excess = nr_pages - soft_limit; |
753 | 753 | ||
754 | return excess; | 754 | return excess; |
755 | } | 755 | } |
756 | 756 | ||
757 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) | 757 | static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) |
758 | { | 758 | { |
759 | unsigned long excess; | 759 | unsigned long excess; |
760 | struct mem_cgroup_per_zone *mz; | 760 | struct mem_cgroup_per_zone *mz; |
761 | struct mem_cgroup_tree_per_zone *mctz; | 761 | struct mem_cgroup_tree_per_zone *mctz; |
762 | 762 | ||
763 | mctz = soft_limit_tree_from_page(page); | 763 | mctz = soft_limit_tree_from_page(page); |
764 | /* | 764 | /* |
765 | * Necessary to update all ancestors when hierarchy is used, | 765 | * Necessary to update all ancestors when hierarchy is used, |
766 | * because their event counter is not touched. | 766 | * because their event counter is not touched. |
767 | */ | 767 | */ |
768 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { | 768 | for (; memcg; memcg = parent_mem_cgroup(memcg)) { |
769 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 769 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
770 | excess = soft_limit_excess(memcg); | 770 | excess = soft_limit_excess(memcg); |
771 | /* | 771 | /* |
772 | * We have to update the tree if mz is on RB-tree or | 772 | * We have to update the tree if mz is on RB-tree or |
773 | * mem is over its softlimit. | 773 | * mem is over its softlimit. |
774 | */ | 774 | */ |
775 | if (excess || mz->on_tree) { | 775 | if (excess || mz->on_tree) { |
776 | unsigned long flags; | 776 | unsigned long flags; |
777 | 777 | ||
778 | spin_lock_irqsave(&mctz->lock, flags); | 778 | spin_lock_irqsave(&mctz->lock, flags); |
779 | /* if on-tree, remove it */ | 779 | /* if on-tree, remove it */ |
780 | if (mz->on_tree) | 780 | if (mz->on_tree) |
781 | __mem_cgroup_remove_exceeded(mz, mctz); | 781 | __mem_cgroup_remove_exceeded(mz, mctz); |
782 | /* | 782 | /* |
783 | * Insert again. mz->usage_in_excess will be updated. | 783 | * Insert again. mz->usage_in_excess will be updated. |
784 | * If excess is 0, no tree ops. | 784 | * If excess is 0, no tree ops. |
785 | */ | 785 | */ |
786 | __mem_cgroup_insert_exceeded(mz, mctz, excess); | 786 | __mem_cgroup_insert_exceeded(mz, mctz, excess); |
787 | spin_unlock_irqrestore(&mctz->lock, flags); | 787 | spin_unlock_irqrestore(&mctz->lock, flags); |
788 | } | 788 | } |
789 | } | 789 | } |
790 | } | 790 | } |
791 | 791 | ||
792 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) | 792 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) |
793 | { | 793 | { |
794 | struct mem_cgroup_tree_per_zone *mctz; | 794 | struct mem_cgroup_tree_per_zone *mctz; |
795 | struct mem_cgroup_per_zone *mz; | 795 | struct mem_cgroup_per_zone *mz; |
796 | int nid, zid; | 796 | int nid, zid; |
797 | 797 | ||
798 | for_each_node(nid) { | 798 | for_each_node(nid) { |
799 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 799 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
800 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 800 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; |
801 | mctz = soft_limit_tree_node_zone(nid, zid); | 801 | mctz = soft_limit_tree_node_zone(nid, zid); |
802 | mem_cgroup_remove_exceeded(mz, mctz); | 802 | mem_cgroup_remove_exceeded(mz, mctz); |
803 | } | 803 | } |
804 | } | 804 | } |
805 | } | 805 | } |
806 | 806 | ||
807 | static struct mem_cgroup_per_zone * | 807 | static struct mem_cgroup_per_zone * |
808 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 808 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
809 | { | 809 | { |
810 | struct rb_node *rightmost = NULL; | 810 | struct rb_node *rightmost = NULL; |
811 | struct mem_cgroup_per_zone *mz; | 811 | struct mem_cgroup_per_zone *mz; |
812 | 812 | ||
813 | retry: | 813 | retry: |
814 | mz = NULL; | 814 | mz = NULL; |
815 | rightmost = rb_last(&mctz->rb_root); | 815 | rightmost = rb_last(&mctz->rb_root); |
816 | if (!rightmost) | 816 | if (!rightmost) |
817 | goto done; /* Nothing to reclaim from */ | 817 | goto done; /* Nothing to reclaim from */ |
818 | 818 | ||
819 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | 819 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); |
820 | /* | 820 | /* |
821 | * Remove the node now but someone else can add it back, | 821 | * Remove the node now but someone else can add it back, |
822 | * we will add it back at the end of reclaim to its correct | 822 | * we will add it back at the end of reclaim to its correct |
823 | * position in the tree. | 823 | * position in the tree. |
824 | */ | 824 | */ |
825 | __mem_cgroup_remove_exceeded(mz, mctz); | 825 | __mem_cgroup_remove_exceeded(mz, mctz); |
826 | if (!soft_limit_excess(mz->memcg) || | 826 | if (!soft_limit_excess(mz->memcg) || |
827 | !css_tryget_online(&mz->memcg->css)) | 827 | !css_tryget_online(&mz->memcg->css)) |
828 | goto retry; | 828 | goto retry; |
829 | done: | 829 | done: |
830 | return mz; | 830 | return mz; |
831 | } | 831 | } |
832 | 832 | ||
833 | static struct mem_cgroup_per_zone * | 833 | static struct mem_cgroup_per_zone * |
834 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | 834 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) |
835 | { | 835 | { |
836 | struct mem_cgroup_per_zone *mz; | 836 | struct mem_cgroup_per_zone *mz; |
837 | 837 | ||
838 | spin_lock_irq(&mctz->lock); | 838 | spin_lock_irq(&mctz->lock); |
839 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | 839 | mz = __mem_cgroup_largest_soft_limit_node(mctz); |
840 | spin_unlock_irq(&mctz->lock); | 840 | spin_unlock_irq(&mctz->lock); |
841 | return mz; | 841 | return mz; |
842 | } | 842 | } |
843 | 843 | ||
844 | /* | 844 | /* |
845 | * Implementation Note: reading percpu statistics for memcg. | 845 | * Implementation Note: reading percpu statistics for memcg. |
846 | * | 846 | * |
847 | * Both vmstat[] and percpu_counter have thresholds and do periodic | 847 | * Both vmstat[] and percpu_counter have thresholds and do periodic |
848 | * synchronization to implement "quick" reads. There is a trade-off between | 848 | * synchronization to implement "quick" reads. There is a trade-off between |
849 | * reading cost and precision of the value. Given that, we may eventually implement | 849 | * reading cost and precision of the value. Given that, we may eventually implement |
850 | * a periodic synchronization of the counters in memcg as well. | 850 | * a periodic synchronization of the counters in memcg as well. |
851 | * | 851 | * |
852 | * But this _read() function is used for the user interface now. The user accounts | 852 | * But this _read() function is used for the user interface now. The user accounts |
853 | * memory usage by memory cgroup and _always_ requires an exact value because | 853 | * memory usage by memory cgroup and _always_ requires an exact value because |
854 | * they account memory. Even if we provided a quick-and-fuzzy read, we would always | 854 | * they account memory. Even if we provided a quick-and-fuzzy read, we would always |
855 | * have to visit all online cpus and sum them up. So, for now, unnecessary | 855 | * have to visit all online cpus and sum them up. So, for now, unnecessary |
856 | * synchronization is not implemented. (just implemented for cpu hotplug) | 856 | * synchronization is not implemented. (just implemented for cpu hotplug) |
857 | * | 857 | * |
858 | * If there are kernel-internal actions which can make use of a not-exact | 858 | * If there are kernel-internal actions which can make use of a not-exact |
859 | * value, and reading all cpu values becomes a performance bottleneck in some | 859 | * value, and reading all cpu values becomes a performance bottleneck in some |
860 | * common workload, thresholds and synchronization as in vmstat[] should be | 860 | * common workload, thresholds and synchronization as in vmstat[] should be |
861 | * implemented. | 861 | * implemented. |
862 | */ | 862 | */ |
863 | static long mem_cgroup_read_stat(struct mem_cgroup *memcg, | 863 | static long mem_cgroup_read_stat(struct mem_cgroup *memcg, |
864 | enum mem_cgroup_stat_index idx) | 864 | enum mem_cgroup_stat_index idx) |
865 | { | 865 | { |
866 | long val = 0; | 866 | long val = 0; |
867 | int cpu; | 867 | int cpu; |
868 | 868 | ||
869 | get_online_cpus(); | 869 | get_online_cpus(); |
870 | for_each_online_cpu(cpu) | 870 | for_each_online_cpu(cpu) |
871 | val += per_cpu(memcg->stat->count[idx], cpu); | 871 | val += per_cpu(memcg->stat->count[idx], cpu); |
872 | #ifdef CONFIG_HOTPLUG_CPU | 872 | #ifdef CONFIG_HOTPLUG_CPU |
873 | spin_lock(&memcg->pcp_counter_lock); | 873 | spin_lock(&memcg->pcp_counter_lock); |
874 | val += memcg->nocpu_base.count[idx]; | 874 | val += memcg->nocpu_base.count[idx]; |
875 | spin_unlock(&memcg->pcp_counter_lock); | 875 | spin_unlock(&memcg->pcp_counter_lock); |
876 | #endif | 876 | #endif |
877 | put_online_cpus(); | 877 | put_online_cpus(); |
878 | return val; | 878 | return val; |
879 | } | 879 | } |
880 | 880 | ||
881 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 881 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
882 | enum mem_cgroup_events_index idx) | 882 | enum mem_cgroup_events_index idx) |
883 | { | 883 | { |
884 | unsigned long val = 0; | 884 | unsigned long val = 0; |
885 | int cpu; | 885 | int cpu; |
886 | 886 | ||
887 | get_online_cpus(); | 887 | get_online_cpus(); |
888 | for_each_online_cpu(cpu) | 888 | for_each_online_cpu(cpu) |
889 | val += per_cpu(memcg->stat->events[idx], cpu); | 889 | val += per_cpu(memcg->stat->events[idx], cpu); |
890 | #ifdef CONFIG_HOTPLUG_CPU | 890 | #ifdef CONFIG_HOTPLUG_CPU |
891 | spin_lock(&memcg->pcp_counter_lock); | 891 | spin_lock(&memcg->pcp_counter_lock); |
892 | val += memcg->nocpu_base.events[idx]; | 892 | val += memcg->nocpu_base.events[idx]; |
893 | spin_unlock(&memcg->pcp_counter_lock); | 893 | spin_unlock(&memcg->pcp_counter_lock); |
894 | #endif | 894 | #endif |
895 | put_online_cpus(); | 895 | put_online_cpus(); |
896 | return val; | 896 | return val; |
897 | } | 897 | } |
898 | 898 | ||
899 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 899 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
900 | struct page *page, | 900 | struct page *page, |
901 | int nr_pages) | 901 | int nr_pages) |
902 | { | 902 | { |
903 | /* | 903 | /* |
904 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is | 904 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
905 | * counted as CACHE even if it's on ANON LRU. | 905 | * counted as CACHE even if it's on ANON LRU. |
906 | */ | 906 | */ |
907 | if (PageAnon(page)) | 907 | if (PageAnon(page)) |
908 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], | 908 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], |
909 | nr_pages); | 909 | nr_pages); |
910 | else | 910 | else |
911 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 911 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
912 | nr_pages); | 912 | nr_pages); |
913 | 913 | ||
914 | if (PageTransHuge(page)) | 914 | if (PageTransHuge(page)) |
915 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 915 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
916 | nr_pages); | 916 | nr_pages); |
917 | 917 | ||
918 | /* pagein of a big page is an event. So, ignore page size */ | 918 | /* pagein of a big page is an event. So, ignore page size */ |
919 | if (nr_pages > 0) | 919 | if (nr_pages > 0) |
920 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); | 920 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); |
921 | else { | 921 | else { |
922 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); | 922 | __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); |
923 | nr_pages = -nr_pages; /* for event */ | 923 | nr_pages = -nr_pages; /* for event */ |
924 | } | 924 | } |
925 | 925 | ||
926 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); | 926 | __this_cpu_add(memcg->stat->nr_page_events, nr_pages); |
927 | } | 927 | } |
928 | 928 | ||
929 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) | 929 | unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) |
930 | { | 930 | { |
931 | struct mem_cgroup_per_zone *mz; | 931 | struct mem_cgroup_per_zone *mz; |
932 | 932 | ||
933 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | 933 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); |
934 | return mz->lru_size[lru]; | 934 | return mz->lru_size[lru]; |
935 | } | 935 | } |
936 | 936 | ||
937 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, | 937 | static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, |
938 | int nid, | 938 | int nid, |
939 | unsigned int lru_mask) | 939 | unsigned int lru_mask) |
940 | { | 940 | { |
941 | unsigned long nr = 0; | 941 | unsigned long nr = 0; |
942 | int zid; | 942 | int zid; |
943 | 943 | ||
944 | VM_BUG_ON((unsigned)nid >= nr_node_ids); | 944 | VM_BUG_ON((unsigned)nid >= nr_node_ids); |
945 | 945 | ||
946 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 946 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
947 | struct mem_cgroup_per_zone *mz; | 947 | struct mem_cgroup_per_zone *mz; |
948 | enum lru_list lru; | 948 | enum lru_list lru; |
949 | 949 | ||
950 | for_each_lru(lru) { | 950 | for_each_lru(lru) { |
951 | if (!(BIT(lru) & lru_mask)) | 951 | if (!(BIT(lru) & lru_mask)) |
952 | continue; | 952 | continue; |
953 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 953 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; |
954 | nr += mz->lru_size[lru]; | 954 | nr += mz->lru_size[lru]; |
955 | } | 955 | } |
956 | } | 956 | } |
957 | return nr; | 957 | return nr; |
958 | } | 958 | } |
959 | 959 | ||
960 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | 960 | static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, |
961 | unsigned int lru_mask) | 961 | unsigned int lru_mask) |
962 | { | 962 | { |
963 | unsigned long nr = 0; | 963 | unsigned long nr = 0; |
964 | int nid; | 964 | int nid; |
965 | 965 | ||
966 | for_each_node_state(nid, N_MEMORY) | 966 | for_each_node_state(nid, N_MEMORY) |
967 | nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); | 967 | nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
968 | return nr; | 968 | return nr; |
969 | } | 969 | } |
970 | 970 | ||
971 | static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, | 971 | static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, |
972 | enum mem_cgroup_events_target target) | 972 | enum mem_cgroup_events_target target) |
973 | { | 973 | { |
974 | unsigned long val, next; | 974 | unsigned long val, next; |
975 | 975 | ||
976 | val = __this_cpu_read(memcg->stat->nr_page_events); | 976 | val = __this_cpu_read(memcg->stat->nr_page_events); |
977 | next = __this_cpu_read(memcg->stat->targets[target]); | 977 | next = __this_cpu_read(memcg->stat->targets[target]); |
978 | /* from time_after() in jiffies.h */ | 978 | /* from time_after() in jiffies.h */ |
979 | if ((long)next - (long)val < 0) { | 979 | if ((long)next - (long)val < 0) { |
980 | switch (target) { | 980 | switch (target) { |
981 | case MEM_CGROUP_TARGET_THRESH: | 981 | case MEM_CGROUP_TARGET_THRESH: |
982 | next = val + THRESHOLDS_EVENTS_TARGET; | 982 | next = val + THRESHOLDS_EVENTS_TARGET; |
983 | break; | 983 | break; |
984 | case MEM_CGROUP_TARGET_SOFTLIMIT: | 984 | case MEM_CGROUP_TARGET_SOFTLIMIT: |
985 | next = val + SOFTLIMIT_EVENTS_TARGET; | 985 | next = val + SOFTLIMIT_EVENTS_TARGET; |
986 | break; | 986 | break; |
987 | case MEM_CGROUP_TARGET_NUMAINFO: | 987 | case MEM_CGROUP_TARGET_NUMAINFO: |
988 | next = val + NUMAINFO_EVENTS_TARGET; | 988 | next = val + NUMAINFO_EVENTS_TARGET; |
989 | break; | 989 | break; |
990 | default: | 990 | default: |
991 | break; | 991 | break; |
992 | } | 992 | } |
993 | __this_cpu_write(memcg->stat->targets[target], next); | 993 | __this_cpu_write(memcg->stat->targets[target], next); |
994 | return true; | 994 | return true; |
995 | } | 995 | } |
996 | return false; | 996 | return false; |
997 | } | 997 | } |
998 | 998 | ||
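Editor's note: the "(long)next - (long)val < 0" test above is the same wraparound-safe trick as time_after() in jiffies.h, which the in-line comment already points at: it keeps working when nr_page_events wraps around ULONG_MAX, where a plain unsigned comparison would not. A tiny illustration with made-up counter values, for this sketch only:

    /* Sketch only: wrap-safe "val has passed next" test. */
    unsigned long next = ULONG_MAX - 10;    /* target set shortly before the wrap */
    unsigned long val  = 50;                /* event count, already wrapped past it */

    /* an unsigned "val >= next" would say no; the signed difference says yes */
    if ((long)next - (long)val < 0)
            pr_info("threshold target passed despite the counter wrap\n");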
999 | /* | 999 | /* |
1000 | * Check events in order. | 1000 | * Check events in order. |
1001 | * | 1001 | * |
1002 | */ | 1002 | */ |
1003 | static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | 1003 | static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) |
1004 | { | 1004 | { |
1005 | /* threshold event is triggered in finer grain than soft limit */ | 1005 | /* threshold event is triggered in finer grain than soft limit */ |
1006 | if (unlikely(mem_cgroup_event_ratelimit(memcg, | 1006 | if (unlikely(mem_cgroup_event_ratelimit(memcg, |
1007 | MEM_CGROUP_TARGET_THRESH))) { | 1007 | MEM_CGROUP_TARGET_THRESH))) { |
1008 | bool do_softlimit; | 1008 | bool do_softlimit; |
1009 | bool do_numainfo __maybe_unused; | 1009 | bool do_numainfo __maybe_unused; |
1010 | 1010 | ||
1011 | do_softlimit = mem_cgroup_event_ratelimit(memcg, | 1011 | do_softlimit = mem_cgroup_event_ratelimit(memcg, |
1012 | MEM_CGROUP_TARGET_SOFTLIMIT); | 1012 | MEM_CGROUP_TARGET_SOFTLIMIT); |
1013 | #if MAX_NUMNODES > 1 | 1013 | #if MAX_NUMNODES > 1 |
1014 | do_numainfo = mem_cgroup_event_ratelimit(memcg, | 1014 | do_numainfo = mem_cgroup_event_ratelimit(memcg, |
1015 | MEM_CGROUP_TARGET_NUMAINFO); | 1015 | MEM_CGROUP_TARGET_NUMAINFO); |
1016 | #endif | 1016 | #endif |
1017 | mem_cgroup_threshold(memcg); | 1017 | mem_cgroup_threshold(memcg); |
1018 | if (unlikely(do_softlimit)) | 1018 | if (unlikely(do_softlimit)) |
1019 | mem_cgroup_update_tree(memcg, page); | 1019 | mem_cgroup_update_tree(memcg, page); |
1020 | #if MAX_NUMNODES > 1 | 1020 | #if MAX_NUMNODES > 1 |
1021 | if (unlikely(do_numainfo)) | 1021 | if (unlikely(do_numainfo)) |
1022 | atomic_inc(&memcg->numainfo_events); | 1022 | atomic_inc(&memcg->numainfo_events); |
1023 | #endif | 1023 | #endif |
1024 | } | 1024 | } |
1025 | } | 1025 | } |
1026 | 1026 | ||
1027 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 1027 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
1028 | { | 1028 | { |
1029 | /* | 1029 | /* |
1030 | * mm_update_next_owner() may clear mm->owner to NULL | 1030 | * mm_update_next_owner() may clear mm->owner to NULL |
1031 | * if it races with swapoff, page migration, etc. | 1031 | * if it races with swapoff, page migration, etc. |
1032 | * So this can be called with p == NULL. | 1032 | * So this can be called with p == NULL. |
1033 | */ | 1033 | */ |
1034 | if (unlikely(!p)) | 1034 | if (unlikely(!p)) |
1035 | return NULL; | 1035 | return NULL; |
1036 | 1036 | ||
1037 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); | 1037 | return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); |
1038 | } | 1038 | } |
1039 | 1039 | ||
1040 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) | 1040 | static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) |
1041 | { | 1041 | { |
1042 | struct mem_cgroup *memcg = NULL; | 1042 | struct mem_cgroup *memcg = NULL; |
1043 | 1043 | ||
1044 | rcu_read_lock(); | 1044 | rcu_read_lock(); |
1045 | do { | 1045 | do { |
1046 | /* | 1046 | /* |
1047 | * Page cache insertions can happen without an | 1047 | * Page cache insertions can happen without an |
1048 | * actual mm context, e.g. during disk probing | 1048 | * actual mm context, e.g. during disk probing |
1049 | * on boot, loopback IO, acct() writes etc. | 1049 | * on boot, loopback IO, acct() writes etc. |
1050 | */ | 1050 | */ |
1051 | if (unlikely(!mm)) | 1051 | if (unlikely(!mm)) |
1052 | memcg = root_mem_cgroup; | 1052 | memcg = root_mem_cgroup; |
1053 | else { | 1053 | else { |
1054 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1054 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1055 | if (unlikely(!memcg)) | 1055 | if (unlikely(!memcg)) |
1056 | memcg = root_mem_cgroup; | 1056 | memcg = root_mem_cgroup; |
1057 | } | 1057 | } |
1058 | } while (!css_tryget_online(&memcg->css)); | 1058 | } while (!css_tryget_online(&memcg->css)); |
1059 | rcu_read_unlock(); | 1059 | rcu_read_unlock(); |
1060 | return memcg; | 1060 | return memcg; |
1061 | } | 1061 | } |
1062 | 1062 | ||
1063 | /** | 1063 | /** |
1064 | * mem_cgroup_iter - iterate over memory cgroup hierarchy | 1064 | * mem_cgroup_iter - iterate over memory cgroup hierarchy |
1065 | * @root: hierarchy root | 1065 | * @root: hierarchy root |
1066 | * @prev: previously returned memcg, NULL on first invocation | 1066 | * @prev: previously returned memcg, NULL on first invocation |
1067 | * @reclaim: cookie for shared reclaim walks, NULL for full walks | 1067 | * @reclaim: cookie for shared reclaim walks, NULL for full walks |
1068 | * | 1068 | * |
1069 | * Returns references to children of the hierarchy below @root, or | 1069 | * Returns references to children of the hierarchy below @root, or |
1070 | * @root itself, or %NULL after a full round-trip. | 1070 | * @root itself, or %NULL after a full round-trip. |
1071 | * | 1071 | * |
1072 | * Caller must pass the return value in @prev on subsequent | 1072 | * Caller must pass the return value in @prev on subsequent |
1073 | * invocations for reference counting, or use mem_cgroup_iter_break() | 1073 | * invocations for reference counting, or use mem_cgroup_iter_break() |
1074 | * to cancel a hierarchy walk before the round-trip is complete. | 1074 | * to cancel a hierarchy walk before the round-trip is complete. |
1075 | * | 1075 | * |
1076 | * Reclaimers can specify a zone and a priority level in @reclaim to | 1076 | * Reclaimers can specify a zone and a priority level in @reclaim to |
1077 | * divide up the memcgs in the hierarchy among all concurrent | 1077 | * divide up the memcgs in the hierarchy among all concurrent |
1078 | * reclaimers operating on the same zone and priority. | 1078 | * reclaimers operating on the same zone and priority. |
1079 | */ | 1079 | */ |
1080 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | 1080 | struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, |
1081 | struct mem_cgroup *prev, | 1081 | struct mem_cgroup *prev, |
1082 | struct mem_cgroup_reclaim_cookie *reclaim) | 1082 | struct mem_cgroup_reclaim_cookie *reclaim) |
1083 | { | 1083 | { |
1084 | struct reclaim_iter *uninitialized_var(iter); | 1084 | struct reclaim_iter *uninitialized_var(iter); |
1085 | struct cgroup_subsys_state *css = NULL; | 1085 | struct cgroup_subsys_state *css = NULL; |
1086 | struct mem_cgroup *memcg = NULL; | 1086 | struct mem_cgroup *memcg = NULL; |
1087 | struct mem_cgroup *pos = NULL; | 1087 | struct mem_cgroup *pos = NULL; |
1088 | 1088 | ||
1089 | if (mem_cgroup_disabled()) | 1089 | if (mem_cgroup_disabled()) |
1090 | return NULL; | 1090 | return NULL; |
1091 | 1091 | ||
1092 | if (!root) | 1092 | if (!root) |
1093 | root = root_mem_cgroup; | 1093 | root = root_mem_cgroup; |
1094 | 1094 | ||
1095 | if (prev && !reclaim) | 1095 | if (prev && !reclaim) |
1096 | pos = prev; | 1096 | pos = prev; |
1097 | 1097 | ||
1098 | if (!root->use_hierarchy && root != root_mem_cgroup) { | 1098 | if (!root->use_hierarchy && root != root_mem_cgroup) { |
1099 | if (prev) | 1099 | if (prev) |
1100 | goto out; | 1100 | goto out; |
1101 | return root; | 1101 | return root; |
1102 | } | 1102 | } |
1103 | 1103 | ||
1104 | rcu_read_lock(); | 1104 | rcu_read_lock(); |
1105 | 1105 | ||
1106 | if (reclaim) { | 1106 | if (reclaim) { |
1107 | struct mem_cgroup_per_zone *mz; | 1107 | struct mem_cgroup_per_zone *mz; |
1108 | 1108 | ||
1109 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); | 1109 | mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); |
1110 | iter = &mz->iter[reclaim->priority]; | 1110 | iter = &mz->iter[reclaim->priority]; |
1111 | 1111 | ||
1112 | if (prev && reclaim->generation != iter->generation) | 1112 | if (prev && reclaim->generation != iter->generation) |
1113 | goto out_unlock; | 1113 | goto out_unlock; |
1114 | 1114 | ||
1115 | do { | 1115 | do { |
1116 | pos = ACCESS_ONCE(iter->position); | 1116 | pos = ACCESS_ONCE(iter->position); |
1117 | /* | 1117 | /* |
1118 | * A racing update may change the position and | 1118 | * A racing update may change the position and |
1119 | * put the last reference, hence css_tryget(); | 1119 | * put the last reference, hence css_tryget(); |
1120 | * on failure, retry to see the updated position. | 1120 | * on failure, retry to see the updated position. |
1121 | */ | 1121 | */ |
1122 | } while (pos && !css_tryget(&pos->css)); | 1122 | } while (pos && !css_tryget(&pos->css)); |
1123 | } | 1123 | } |
1124 | 1124 | ||
1125 | if (pos) | 1125 | if (pos) |
1126 | css = &pos->css; | 1126 | css = &pos->css; |
1127 | 1127 | ||
1128 | for (;;) { | 1128 | for (;;) { |
1129 | css = css_next_descendant_pre(css, &root->css); | 1129 | css = css_next_descendant_pre(css, &root->css); |
1130 | if (!css) { | 1130 | if (!css) { |
1131 | /* | 1131 | /* |
1132 | * Reclaimers share the hierarchy walk, and a | 1132 | * Reclaimers share the hierarchy walk, and a |
1133 | * new one might jump in right at the end of | 1133 | * new one might jump in right at the end of |
1134 | * the hierarchy - make sure they see at least | 1134 | * the hierarchy - make sure they see at least |
1135 | * one group and restart from the beginning. | 1135 | * one group and restart from the beginning. |
1136 | */ | 1136 | */ |
1137 | if (!prev) | 1137 | if (!prev) |
1138 | continue; | 1138 | continue; |
1139 | break; | 1139 | break; |
1140 | } | 1140 | } |
1141 | 1141 | ||
1142 | /* | 1142 | /* |
1143 | * Verify the css and acquire a reference. The root | 1143 | * Verify the css and acquire a reference. The root |
1144 | * is provided by the caller, so we know it's alive | 1144 | * is provided by the caller, so we know it's alive |
1145 | * and kicking, and don't take an extra reference. | 1145 | * and kicking, and don't take an extra reference. |
1146 | */ | 1146 | */ |
1147 | memcg = mem_cgroup_from_css(css); | 1147 | memcg = mem_cgroup_from_css(css); |
1148 | 1148 | ||
1149 | if (css == &root->css) | 1149 | if (css == &root->css) |
1150 | break; | 1150 | break; |
1151 | 1151 | ||
1152 | if (css_tryget_online(css)) { | 1152 | if (css_tryget_online(css)) { |
1153 | /* | 1153 | /* |
1154 | * Make sure the memcg is initialized: | 1154 | * Make sure the memcg is initialized: |
1155 | * mem_cgroup_css_online() orders the | 1155 | * mem_cgroup_css_online() orders the |
1156 | * initialization against setting the flag. | 1156 | * initialization against setting the flag. |
1157 | */ | 1157 | */ |
1158 | if (smp_load_acquire(&memcg->initialized)) | 1158 | if (smp_load_acquire(&memcg->initialized)) |
1159 | break; | 1159 | break; |
1160 | 1160 | ||
1161 | css_put(css); | 1161 | css_put(css); |
1162 | } | 1162 | } |
1163 | 1163 | ||
1164 | memcg = NULL; | 1164 | memcg = NULL; |
1165 | } | 1165 | } |
1166 | 1166 | ||
1167 | if (reclaim) { | 1167 | if (reclaim) { |
1168 | if (cmpxchg(&iter->position, pos, memcg) == pos) { | 1168 | if (cmpxchg(&iter->position, pos, memcg) == pos) { |
1169 | if (memcg) | 1169 | if (memcg) |
1170 | css_get(&memcg->css); | 1170 | css_get(&memcg->css); |
1171 | if (pos) | 1171 | if (pos) |
1172 | css_put(&pos->css); | 1172 | css_put(&pos->css); |
1173 | } | 1173 | } |
1174 | 1174 | ||
1175 | /* | 1175 | /* |
1176 | * pairs with css_tryget when dereferencing iter->position | 1176 | * pairs with css_tryget when dereferencing iter->position |
1177 | * above. | 1177 | * above. |
1178 | */ | 1178 | */ |
1179 | if (pos) | 1179 | if (pos) |
1180 | css_put(&pos->css); | 1180 | css_put(&pos->css); |
1181 | 1181 | ||
1182 | if (!memcg) | 1182 | if (!memcg) |
1183 | iter->generation++; | 1183 | iter->generation++; |
1184 | else if (!prev) | 1184 | else if (!prev) |
1185 | reclaim->generation = iter->generation; | 1185 | reclaim->generation = iter->generation; |
1186 | } | 1186 | } |
1187 | 1187 | ||
1188 | out_unlock: | 1188 | out_unlock: |
1189 | rcu_read_unlock(); | 1189 | rcu_read_unlock(); |
1190 | out: | 1190 | out: |
1191 | if (prev && prev != root) | 1191 | if (prev && prev != root) |
1192 | css_put(&prev->css); | 1192 | css_put(&prev->css); |
1193 | 1193 | ||
1194 | return memcg; | 1194 | return memcg; |
1195 | } | 1195 | } |
1196 | 1196 | ||
1197 | /** | 1197 | /** |
1198 | * mem_cgroup_iter_break - abort a hierarchy walk prematurely | 1198 | * mem_cgroup_iter_break - abort a hierarchy walk prematurely |
1199 | * @root: hierarchy root | 1199 | * @root: hierarchy root |
1200 | * @prev: last visited hierarchy member as returned by mem_cgroup_iter() | 1200 | * @prev: last visited hierarchy member as returned by mem_cgroup_iter() |
1201 | */ | 1201 | */ |
1202 | void mem_cgroup_iter_break(struct mem_cgroup *root, | 1202 | void mem_cgroup_iter_break(struct mem_cgroup *root, |
1203 | struct mem_cgroup *prev) | 1203 | struct mem_cgroup *prev) |
1204 | { | 1204 | { |
1205 | if (!root) | 1205 | if (!root) |
1206 | root = root_mem_cgroup; | 1206 | root = root_mem_cgroup; |
1207 | if (prev && prev != root) | 1207 | if (prev && prev != root) |
1208 | css_put(&prev->css); | 1208 | css_put(&prev->css); |
1209 | } | 1209 | } |
1210 | 1210 | ||
1211 | /* | 1211 | /* |
1212 | * Iteration constructs for visiting all cgroups (under a tree). If | 1212 | * Iteration constructs for visiting all cgroups (under a tree). If |
1213 | * loops are exited prematurely (break), mem_cgroup_iter_break() must | 1213 | * loops are exited prematurely (break), mem_cgroup_iter_break() must |
1214 | * be used for reference counting. | 1214 | * be used for reference counting. |
1215 | */ | 1215 | */ |
1216 | #define for_each_mem_cgroup_tree(iter, root) \ | 1216 | #define for_each_mem_cgroup_tree(iter, root) \ |
1217 | for (iter = mem_cgroup_iter(root, NULL, NULL); \ | 1217 | for (iter = mem_cgroup_iter(root, NULL, NULL); \ |
1218 | iter != NULL; \ | 1218 | iter != NULL; \ |
1219 | iter = mem_cgroup_iter(root, iter, NULL)) | 1219 | iter = mem_cgroup_iter(root, iter, NULL)) |
1220 | 1220 | ||
1221 | #define for_each_mem_cgroup(iter) \ | 1221 | #define for_each_mem_cgroup(iter) \ |
1222 | for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ | 1222 | for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ |
1223 | iter != NULL; \ | 1223 | iter != NULL; \ |
1224 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1224 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
1225 | 1225 | ||
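The two iteration macros above are the usual way to consume mem_cgroup_iter(); if a walk is abandoned before the full round-trip, mem_cgroup_iter_break() has to drop the reference held on the last returned group. A minimal sketch of that pattern, where should_stop() stands in for hypothetical caller-specific logic:

	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (should_stop(iter)) {
			/* drop the css reference taken by mem_cgroup_iter() */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}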
1226 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1226 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
1227 | { | 1227 | { |
1228 | struct mem_cgroup *memcg; | 1228 | struct mem_cgroup *memcg; |
1229 | 1229 | ||
1230 | rcu_read_lock(); | 1230 | rcu_read_lock(); |
1231 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1231 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
1232 | if (unlikely(!memcg)) | 1232 | if (unlikely(!memcg)) |
1233 | goto out; | 1233 | goto out; |
1234 | 1234 | ||
1235 | switch (idx) { | 1235 | switch (idx) { |
1236 | case PGFAULT: | 1236 | case PGFAULT: |
1237 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); | 1237 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); |
1238 | break; | 1238 | break; |
1239 | case PGMAJFAULT: | 1239 | case PGMAJFAULT: |
1240 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); | 1240 | this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); |
1241 | break; | 1241 | break; |
1242 | default: | 1242 | default: |
1243 | BUG(); | 1243 | BUG(); |
1244 | } | 1244 | } |
1245 | out: | 1245 | out: |
1246 | rcu_read_unlock(); | 1246 | rcu_read_unlock(); |
1247 | } | 1247 | } |
1248 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); | 1248 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); |
1249 | 1249 | ||
1250 | /** | 1250 | /** |
1251 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1251 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
1252 | * @zone: zone of the wanted lruvec | 1252 | * @zone: zone of the wanted lruvec |
1253 | * @memcg: memcg of the wanted lruvec | 1253 | * @memcg: memcg of the wanted lruvec |
1254 | * | 1254 | * |
1255 | * Returns the lru list vector holding pages for the given @zone and | 1255 | * Returns the lru list vector holding pages for the given @zone and |
1256 | * @memcg. This can be the global zone lruvec, if the memory controller | 1256 | * @memcg. This can be the global zone lruvec, if the memory controller |
1257 | * is disabled. | 1257 | * is disabled. |
1258 | */ | 1258 | */ |
1259 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, | 1259 | struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, |
1260 | struct mem_cgroup *memcg) | 1260 | struct mem_cgroup *memcg) |
1261 | { | 1261 | { |
1262 | struct mem_cgroup_per_zone *mz; | 1262 | struct mem_cgroup_per_zone *mz; |
1263 | struct lruvec *lruvec; | 1263 | struct lruvec *lruvec; |
1264 | 1264 | ||
1265 | if (mem_cgroup_disabled()) { | 1265 | if (mem_cgroup_disabled()) { |
1266 | lruvec = &zone->lruvec; | 1266 | lruvec = &zone->lruvec; |
1267 | goto out; | 1267 | goto out; |
1268 | } | 1268 | } |
1269 | 1269 | ||
1270 | mz = mem_cgroup_zone_zoneinfo(memcg, zone); | 1270 | mz = mem_cgroup_zone_zoneinfo(memcg, zone); |
1271 | lruvec = &mz->lruvec; | 1271 | lruvec = &mz->lruvec; |
1272 | out: | 1272 | out: |
1273 | /* | 1273 | /* |
1274 | * Since a node can be onlined after the mem_cgroup was created, | 1274 | * Since a node can be onlined after the mem_cgroup was created, |
1275 | * we have to be prepared to initialize lruvec->zone here; | 1275 | * we have to be prepared to initialize lruvec->zone here; |
1276 | * and if offlined then reonlined, we need to reinitialize it. | 1276 | * and if offlined then reonlined, we need to reinitialize it. |
1277 | */ | 1277 | */ |
1278 | if (unlikely(lruvec->zone != zone)) | 1278 | if (unlikely(lruvec->zone != zone)) |
1279 | lruvec->zone = zone; | 1279 | lruvec->zone = zone; |
1280 | return lruvec; | 1280 | return lruvec; |
1281 | } | 1281 | } |
1282 | 1282 | ||
1283 | /** | 1283 | /** |
1284 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page | 1284 | * mem_cgroup_page_lruvec - return lruvec for adding an lru page |
1285 | * @page: the page | 1285 | * @page: the page |
1286 | * @zone: zone of the page | 1286 | * @zone: zone of the page |
1287 | */ | 1287 | */ |
1288 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) | 1288 | struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) |
1289 | { | 1289 | { |
1290 | struct mem_cgroup_per_zone *mz; | 1290 | struct mem_cgroup_per_zone *mz; |
1291 | struct mem_cgroup *memcg; | 1291 | struct mem_cgroup *memcg; |
1292 | struct page_cgroup *pc; | 1292 | struct page_cgroup *pc; |
1293 | struct lruvec *lruvec; | 1293 | struct lruvec *lruvec; |
1294 | 1294 | ||
1295 | if (mem_cgroup_disabled()) { | 1295 | if (mem_cgroup_disabled()) { |
1296 | lruvec = &zone->lruvec; | 1296 | lruvec = &zone->lruvec; |
1297 | goto out; | 1297 | goto out; |
1298 | } | 1298 | } |
1299 | 1299 | ||
1300 | pc = lookup_page_cgroup(page); | 1300 | pc = lookup_page_cgroup(page); |
1301 | memcg = pc->mem_cgroup; | 1301 | memcg = pc->mem_cgroup; |
1302 | 1302 | ||
1303 | /* | 1303 | /* |
1304 | * Surreptitiously switch any uncharged offlist page to root: | 1304 | * Surreptitiously switch any uncharged offlist page to root: |
1305 | * an uncharged page off lru does nothing to secure | 1305 | * an uncharged page off lru does nothing to secure |
1306 | * its former mem_cgroup from sudden removal. | 1306 | * its former mem_cgroup from sudden removal. |
1307 | * | 1307 | * |
1308 | * Our caller holds lru_lock, and PageCgroupUsed is updated | 1308 | * Our caller holds lru_lock, and PageCgroupUsed is updated |
1309 | * under page_cgroup lock: between them, they make all uses | 1309 | * under page_cgroup lock: between them, they make all uses |
1310 | * of pc->mem_cgroup safe. | 1310 | * of pc->mem_cgroup safe. |
1311 | */ | 1311 | */ |
1312 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) | 1312 | if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) |
1313 | pc->mem_cgroup = memcg = root_mem_cgroup; | 1313 | pc->mem_cgroup = memcg = root_mem_cgroup; |
1314 | 1314 | ||
1315 | mz = mem_cgroup_page_zoneinfo(memcg, page); | 1315 | mz = mem_cgroup_page_zoneinfo(memcg, page); |
1316 | lruvec = &mz->lruvec; | 1316 | lruvec = &mz->lruvec; |
1317 | out: | 1317 | out: |
1318 | /* | 1318 | /* |
1319 | * Since a node can be onlined after the mem_cgroup was created, | 1319 | * Since a node can be onlined after the mem_cgroup was created, |
1320 | * we have to be prepared to initialize lruvec->zone here; | 1320 | * we have to be prepared to initialize lruvec->zone here; |
1321 | * and if offlined then reonlined, we need to reinitialize it. | 1321 | * and if offlined then reonlined, we need to reinitialize it. |
1322 | */ | 1322 | */ |
1323 | if (unlikely(lruvec->zone != zone)) | 1323 | if (unlikely(lruvec->zone != zone)) |
1324 | lruvec->zone = zone; | 1324 | lruvec->zone = zone; |
1325 | return lruvec; | 1325 | return lruvec; |
1326 | } | 1326 | } |
1327 | 1327 | ||
1328 | /** | 1328 | /** |
1329 | * mem_cgroup_update_lru_size - account for adding or removing an lru page | 1329 | * mem_cgroup_update_lru_size - account for adding or removing an lru page |
1330 | * @lruvec: mem_cgroup per zone lru vector | 1330 | * @lruvec: mem_cgroup per zone lru vector |
1331 | * @lru: index of lru list the page is sitting on | 1331 | * @lru: index of lru list the page is sitting on |
1332 | * @nr_pages: positive when adding or negative when removing | 1332 | * @nr_pages: positive when adding or negative when removing |
1333 | * | 1333 | * |
1334 | * This function must be called when a page is added to or removed from an | 1334 | * This function must be called when a page is added to or removed from an |
1335 | * lru list. | 1335 | * lru list. |
1336 | */ | 1336 | */ |
1337 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, | 1337 | void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, |
1338 | int nr_pages) | 1338 | int nr_pages) |
1339 | { | 1339 | { |
1340 | struct mem_cgroup_per_zone *mz; | 1340 | struct mem_cgroup_per_zone *mz; |
1341 | unsigned long *lru_size; | 1341 | unsigned long *lru_size; |
1342 | 1342 | ||
1343 | if (mem_cgroup_disabled()) | 1343 | if (mem_cgroup_disabled()) |
1344 | return; | 1344 | return; |
1345 | 1345 | ||
1346 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); | 1346 | mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); |
1347 | lru_size = mz->lru_size + lru; | 1347 | lru_size = mz->lru_size + lru; |
1348 | *lru_size += nr_pages; | 1348 | *lru_size += nr_pages; |
1349 | VM_BUG_ON((long)(*lru_size) < 0); | 1349 | VM_BUG_ON((long)(*lru_size) < 0); |
1350 | } | 1350 | } |
1351 | 1351 | ||
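For illustration, a caller adding a page to an lru list pairs this accounting update with the actual list insertion, roughly as the generic lru helpers do; a sketch assuming the page is already charged and isolated:

	int nr_pages = hpage_nr_pages(page);	/* 1, or HPAGE_PMD_NR for a huge page */

	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
	list_add(&page->lru, &lruvec->lists[lru]);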
1352 | /* | 1352 | /* |
1353 | * Checks whether given mem is same or in the root_mem_cgroup's | 1353 | * Checks whether given mem is same or in the root_mem_cgroup's |
1354 | * hierarchy subtree | 1354 | * hierarchy subtree |
1355 | */ | 1355 | */ |
1356 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1356 | bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1357 | struct mem_cgroup *memcg) | 1357 | struct mem_cgroup *memcg) |
1358 | { | 1358 | { |
1359 | if (root_memcg == memcg) | 1359 | if (root_memcg == memcg) |
1360 | return true; | 1360 | return true; |
1361 | if (!root_memcg->use_hierarchy || !memcg) | 1361 | if (!root_memcg->use_hierarchy || !memcg) |
1362 | return false; | 1362 | return false; |
1363 | return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); | 1363 | return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); |
1364 | } | 1364 | } |
1365 | 1365 | ||
1366 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | 1366 | static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, |
1367 | struct mem_cgroup *memcg) | 1367 | struct mem_cgroup *memcg) |
1368 | { | 1368 | { |
1369 | bool ret; | 1369 | bool ret; |
1370 | 1370 | ||
1371 | rcu_read_lock(); | 1371 | rcu_read_lock(); |
1372 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); | 1372 | ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); |
1373 | rcu_read_unlock(); | 1373 | rcu_read_unlock(); |
1374 | return ret; | 1374 | return ret; |
1375 | } | 1375 | } |
1376 | 1376 | ||
1377 | bool task_in_mem_cgroup(struct task_struct *task, | 1377 | bool task_in_mem_cgroup(struct task_struct *task, |
1378 | const struct mem_cgroup *memcg) | 1378 | const struct mem_cgroup *memcg) |
1379 | { | 1379 | { |
1380 | struct mem_cgroup *curr = NULL; | 1380 | struct mem_cgroup *curr = NULL; |
1381 | struct task_struct *p; | 1381 | struct task_struct *p; |
1382 | bool ret; | 1382 | bool ret; |
1383 | 1383 | ||
1384 | p = find_lock_task_mm(task); | 1384 | p = find_lock_task_mm(task); |
1385 | if (p) { | 1385 | if (p) { |
1386 | curr = get_mem_cgroup_from_mm(p->mm); | 1386 | curr = get_mem_cgroup_from_mm(p->mm); |
1387 | task_unlock(p); | 1387 | task_unlock(p); |
1388 | } else { | 1388 | } else { |
1389 | /* | 1389 | /* |
1390 | * All threads may have already detached their mm's, but the oom | 1390 | * All threads may have already detached their mm's, but the oom |
1391 | * killer still needs to detect if they have already been oom | 1391 | * killer still needs to detect if they have already been oom |
1392 | * killed to prevent needlessly killing additional tasks. | 1392 | * killed to prevent needlessly killing additional tasks. |
1393 | */ | 1393 | */ |
1394 | rcu_read_lock(); | 1394 | rcu_read_lock(); |
1395 | curr = mem_cgroup_from_task(task); | 1395 | curr = mem_cgroup_from_task(task); |
1396 | if (curr) | 1396 | if (curr) |
1397 | css_get(&curr->css); | 1397 | css_get(&curr->css); |
1398 | rcu_read_unlock(); | 1398 | rcu_read_unlock(); |
1399 | } | 1399 | } |
1400 | /* | 1400 | /* |
1401 | * We should check use_hierarchy of "memcg", not "curr". Checking | 1401 | * We should check use_hierarchy of "memcg", not "curr". Checking |
1402 | * use_hierarchy of "curr" here would make this function return true if | 1402 | * use_hierarchy of "curr" here would make this function return true if |
1403 | * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the | 1403 | * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the |
1404 | * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). | 1404 | * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). |
1405 | */ | 1405 | */ |
1406 | ret = mem_cgroup_same_or_subtree(memcg, curr); | 1406 | ret = mem_cgroup_same_or_subtree(memcg, curr); |
1407 | css_put(&curr->css); | 1407 | css_put(&curr->css); |
1408 | return ret; | 1408 | return ret; |
1409 | } | 1409 | } |
1410 | 1410 | ||
1411 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) | 1411 | int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) |
1412 | { | 1412 | { |
1413 | unsigned long inactive_ratio; | 1413 | unsigned long inactive_ratio; |
1414 | unsigned long inactive; | 1414 | unsigned long inactive; |
1415 | unsigned long active; | 1415 | unsigned long active; |
1416 | unsigned long gb; | 1416 | unsigned long gb; |
1417 | 1417 | ||
1418 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); | 1418 | inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); |
1419 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); | 1419 | active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); |
1420 | 1420 | ||
1421 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 1421 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
1422 | if (gb) | 1422 | if (gb) |
1423 | inactive_ratio = int_sqrt(10 * gb); | 1423 | inactive_ratio = int_sqrt(10 * gb); |
1424 | else | 1424 | else |
1425 | inactive_ratio = 1; | 1425 | inactive_ratio = 1; |
1426 | 1426 | ||
1427 | return inactive * inactive_ratio < active; | 1427 | return inactive * inactive_ratio < active; |
1428 | } | 1428 | } |
1429 | 1429 | ||
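As a worked example with illustrative numbers: with 4 GiB of anon pages on the two lists combined, gb is 4 and inactive_ratio becomes int_sqrt(40) = 6, so the inactive list is reported as low only while inactive * 6 < active.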
1430 | #define mem_cgroup_from_counter(counter, member) \ | 1430 | #define mem_cgroup_from_counter(counter, member) \ |
1431 | container_of(counter, struct mem_cgroup, member) | 1431 | container_of(counter, struct mem_cgroup, member) |
1432 | 1432 | ||
1433 | /** | 1433 | /** |
1434 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1434 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1435 | * @memcg: the memory cgroup | 1435 | * @memcg: the memory cgroup |
1436 | * | 1436 | * |
1437 | * Returns the maximum amount of memory @memcg can be charged with, in | 1437 | * Returns the maximum amount of memory @memcg can be charged with, in |
1438 | * pages. | 1438 | * pages. |
1439 | */ | 1439 | */ |
1440 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | 1440 | static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) |
1441 | { | 1441 | { |
1442 | unsigned long margin = 0; | 1442 | unsigned long margin = 0; |
1443 | unsigned long count; | 1443 | unsigned long count; |
1444 | unsigned long limit; | 1444 | unsigned long limit; |
1445 | 1445 | ||
1446 | count = page_counter_read(&memcg->memory); | 1446 | count = page_counter_read(&memcg->memory); |
1447 | limit = ACCESS_ONCE(memcg->memory.limit); | 1447 | limit = ACCESS_ONCE(memcg->memory.limit); |
1448 | if (count < limit) | 1448 | if (count < limit) |
1449 | margin = limit - count; | 1449 | margin = limit - count; |
1450 | 1450 | ||
1451 | if (do_swap_account) { | 1451 | if (do_swap_account) { |
1452 | count = page_counter_read(&memcg->memsw); | 1452 | count = page_counter_read(&memcg->memsw); |
1453 | limit = ACCESS_ONCE(memcg->memsw.limit); | 1453 | limit = ACCESS_ONCE(memcg->memsw.limit); |
1454 | if (count <= limit) | 1454 | if (count <= limit) |
1455 | margin = min(margin, limit - count); | 1455 | margin = min(margin, limit - count); |
1456 | } | 1456 | } |
1457 | 1457 | ||
1458 | return margin; | 1458 | return margin; |
1459 | } | 1459 | } |
1460 | 1460 | ||
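An illustrative calculation (numbers made up): with memory usage at 900 pages against a 1000-page limit and, with swap accounting enabled, memsw usage at 980 pages against a 1000-page limit, the margin is min(1000 - 900, 1000 - 980) = 20 pages.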
1461 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) | 1461 | int mem_cgroup_swappiness(struct mem_cgroup *memcg) |
1462 | { | 1462 | { |
1463 | /* root ? */ | 1463 | /* root ? */ |
1464 | if (mem_cgroup_disabled() || !memcg->css.parent) | 1464 | if (mem_cgroup_disabled() || !memcg->css.parent) |
1465 | return vm_swappiness; | 1465 | return vm_swappiness; |
1466 | 1466 | ||
1467 | return memcg->swappiness; | 1467 | return memcg->swappiness; |
1468 | } | 1468 | } |
1469 | 1469 | ||
1470 | /* | 1470 | /* |
1471 | * memcg->moving_account is used for checking possibility that some thread is | 1471 | * memcg->moving_account is used for checking possibility that some thread is |
1472 | * calling move_account(). When a thread on CPU-A starts moving pages under | 1472 | * calling move_account(). When a thread on CPU-A starts moving pages under |
1473 | * a memcg, other threads should check memcg->moving_account under | 1473 | * a memcg, other threads should check memcg->moving_account under |
1474 | * rcu_read_lock(), like this: | 1474 | * rcu_read_lock(), like this: |
1475 | * | 1475 | * |
1476 | * CPU-A CPU-B | 1476 | * CPU-A CPU-B |
1477 | * rcu_read_lock() | 1477 | * rcu_read_lock() |
1478 | * memcg->moving_account+1 if (memcg->moving_account) | 1478 | * memcg->moving_account+1 if (memcg->moving_account) |
1479 | * take heavy locks. | 1479 | * take heavy locks. |
1480 | * synchronize_rcu() update something. | 1480 | * synchronize_rcu() update something. |
1481 | * rcu_read_unlock() | 1481 | * rcu_read_unlock() |
1482 | * start move here. | 1482 | * start move here. |
1483 | */ | 1483 | */ |
1484 | 1484 | ||
1485 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) | 1485 | static void mem_cgroup_start_move(struct mem_cgroup *memcg) |
1486 | { | 1486 | { |
1487 | atomic_inc(&memcg->moving_account); | 1487 | atomic_inc(&memcg->moving_account); |
1488 | synchronize_rcu(); | 1488 | synchronize_rcu(); |
1489 | } | 1489 | } |
1490 | 1490 | ||
1491 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) | 1491 | static void mem_cgroup_end_move(struct mem_cgroup *memcg) |
1492 | { | 1492 | { |
1493 | /* | 1493 | /* |
1494 | * Now, mem_cgroup_clear_mc() may call this function with NULL. | 1494 | * Now, mem_cgroup_clear_mc() may call this function with NULL. |
1495 | * We check NULL in callee rather than caller. | 1495 | * We check NULL in callee rather than caller. |
1496 | */ | 1496 | */ |
1497 | if (memcg) | 1497 | if (memcg) |
1498 | atomic_dec(&memcg->moving_account); | 1498 | atomic_dec(&memcg->moving_account); |
1499 | } | 1499 | } |
1500 | 1500 | ||
1501 | /* | 1501 | /* |
1502 | * A routine for checking whether "mem" is under move_account() or not. | 1502 | * A routine for checking whether "mem" is under move_account() or not. |
1503 | * | 1503 | * |
1504 | * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the | 1504 | * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of the |
1505 | * moving cgroups. This is used for waiting under the high memory pressure | 1505 | * moving cgroups. This is used for waiting under the high memory pressure |
1506 | * caused by a "move". | 1506 | * caused by a "move". |
1507 | */ | 1507 | */ |
1508 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) | 1508 | static bool mem_cgroup_under_move(struct mem_cgroup *memcg) |
1509 | { | 1509 | { |
1510 | struct mem_cgroup *from; | 1510 | struct mem_cgroup *from; |
1511 | struct mem_cgroup *to; | 1511 | struct mem_cgroup *to; |
1512 | bool ret = false; | 1512 | bool ret = false; |
1513 | /* | 1513 | /* |
1514 | * Unlike task_move routines, we access mc.to, mc.from not under | 1514 | * Unlike task_move routines, we access mc.to, mc.from not under |
1515 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. | 1515 | * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. |
1516 | */ | 1516 | */ |
1517 | spin_lock(&mc.lock); | 1517 | spin_lock(&mc.lock); |
1518 | from = mc.from; | 1518 | from = mc.from; |
1519 | to = mc.to; | 1519 | to = mc.to; |
1520 | if (!from) | 1520 | if (!from) |
1521 | goto unlock; | 1521 | goto unlock; |
1522 | 1522 | ||
1523 | ret = mem_cgroup_same_or_subtree(memcg, from) | 1523 | ret = mem_cgroup_same_or_subtree(memcg, from) |
1524 | || mem_cgroup_same_or_subtree(memcg, to); | 1524 | || mem_cgroup_same_or_subtree(memcg, to); |
1525 | unlock: | 1525 | unlock: |
1526 | spin_unlock(&mc.lock); | 1526 | spin_unlock(&mc.lock); |
1527 | return ret; | 1527 | return ret; |
1528 | } | 1528 | } |
1529 | 1529 | ||
1530 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) | 1530 | static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) |
1531 | { | 1531 | { |
1532 | if (mc.moving_task && current != mc.moving_task) { | 1532 | if (mc.moving_task && current != mc.moving_task) { |
1533 | if (mem_cgroup_under_move(memcg)) { | 1533 | if (mem_cgroup_under_move(memcg)) { |
1534 | DEFINE_WAIT(wait); | 1534 | DEFINE_WAIT(wait); |
1535 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); | 1535 | prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); |
1536 | /* moving charge context might have finished. */ | 1536 | /* moving charge context might have finished. */ |
1537 | if (mc.moving_task) | 1537 | if (mc.moving_task) |
1538 | schedule(); | 1538 | schedule(); |
1539 | finish_wait(&mc.waitq, &wait); | 1539 | finish_wait(&mc.waitq, &wait); |
1540 | return true; | 1540 | return true; |
1541 | } | 1541 | } |
1542 | } | 1542 | } |
1543 | return false; | 1543 | return false; |
1544 | } | 1544 | } |
1545 | 1545 | ||
1546 | /* | 1546 | /* |
1547 | * Take this lock when | 1547 | * Take this lock when |
1548 | * - code tries to modify a page's memcg while it's USED. | 1548 | * - code tries to modify a page's memcg while it's USED. |
1549 | * - code tries to modify page state accounting in a memcg. | 1549 | * - code tries to modify page state accounting in a memcg. |
1550 | */ | 1550 | */ |
1551 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, | 1551 | static void move_lock_mem_cgroup(struct mem_cgroup *memcg, |
1552 | unsigned long *flags) | 1552 | unsigned long *flags) |
1553 | { | 1553 | { |
1554 | spin_lock_irqsave(&memcg->move_lock, *flags); | 1554 | spin_lock_irqsave(&memcg->move_lock, *flags); |
1555 | } | 1555 | } |
1556 | 1556 | ||
1557 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, | 1557 | static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, |
1558 | unsigned long *flags) | 1558 | unsigned long *flags) |
1559 | { | 1559 | { |
1560 | spin_unlock_irqrestore(&memcg->move_lock, *flags); | 1560 | spin_unlock_irqrestore(&memcg->move_lock, *flags); |
1561 | } | 1561 | } |
1562 | 1562 | ||
1563 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 1563 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
1564 | /** | 1564 | /** |
1565 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. | 1565 | * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. |
1566 | * @memcg: The memory cgroup that went over limit | 1566 | * @memcg: The memory cgroup that went over limit |
1567 | * @p: Task that is going to be killed | 1567 | * @p: Task that is going to be killed |
1568 | * | 1568 | * |
1569 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is | 1569 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is |
1570 | * enabled | 1570 | * enabled |
1571 | */ | 1571 | */ |
1572 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | 1572 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
1573 | { | 1573 | { |
1574 | /* oom_info_lock ensures that parallel ooms do not interleave */ | 1574 | /* oom_info_lock ensures that parallel ooms do not interleave */ |
1575 | static DEFINE_MUTEX(oom_info_lock); | 1575 | static DEFINE_MUTEX(oom_info_lock); |
1576 | struct mem_cgroup *iter; | 1576 | struct mem_cgroup *iter; |
1577 | unsigned int i; | 1577 | unsigned int i; |
1578 | 1578 | ||
1579 | if (!p) | 1579 | if (!p) |
1580 | return; | 1580 | return; |
1581 | 1581 | ||
1582 | mutex_lock(&oom_info_lock); | 1582 | mutex_lock(&oom_info_lock); |
1583 | rcu_read_lock(); | 1583 | rcu_read_lock(); |
1584 | 1584 | ||
1585 | pr_info("Task in "); | 1585 | pr_info("Task in "); |
1586 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 1586 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); |
1587 | pr_info(" killed as a result of limit of "); | 1587 | pr_info(" killed as a result of limit of "); |
1588 | pr_cont_cgroup_path(memcg->css.cgroup); | 1588 | pr_cont_cgroup_path(memcg->css.cgroup); |
1589 | pr_info("\n"); | 1589 | pr_info("\n"); |
1590 | 1590 | ||
1591 | rcu_read_unlock(); | 1591 | rcu_read_unlock(); |
1592 | 1592 | ||
1593 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", | 1593 | pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", |
1594 | K((u64)page_counter_read(&memcg->memory)), | 1594 | K((u64)page_counter_read(&memcg->memory)), |
1595 | K((u64)memcg->memory.limit), memcg->memory.failcnt); | 1595 | K((u64)memcg->memory.limit), memcg->memory.failcnt); |
1596 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", | 1596 | pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", |
1597 | K((u64)page_counter_read(&memcg->memsw)), | 1597 | K((u64)page_counter_read(&memcg->memsw)), |
1598 | K((u64)memcg->memsw.limit), memcg->memsw.failcnt); | 1598 | K((u64)memcg->memsw.limit), memcg->memsw.failcnt); |
1599 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", | 1599 | pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", |
1600 | K((u64)page_counter_read(&memcg->kmem)), | 1600 | K((u64)page_counter_read(&memcg->kmem)), |
1601 | K((u64)memcg->kmem.limit), memcg->kmem.failcnt); | 1601 | K((u64)memcg->kmem.limit), memcg->kmem.failcnt); |
1602 | 1602 | ||
1603 | for_each_mem_cgroup_tree(iter, memcg) { | 1603 | for_each_mem_cgroup_tree(iter, memcg) { |
1604 | pr_info("Memory cgroup stats for "); | 1604 | pr_info("Memory cgroup stats for "); |
1605 | pr_cont_cgroup_path(iter->css.cgroup); | 1605 | pr_cont_cgroup_path(iter->css.cgroup); |
1606 | pr_cont(":"); | 1606 | pr_cont(":"); |
1607 | 1607 | ||
1608 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 1608 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
1609 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 1609 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
1610 | continue; | 1610 | continue; |
1611 | pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], | 1611 | pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], |
1612 | K(mem_cgroup_read_stat(iter, i))); | 1612 | K(mem_cgroup_read_stat(iter, i))); |
1613 | } | 1613 | } |
1614 | 1614 | ||
1615 | for (i = 0; i < NR_LRU_LISTS; i++) | 1615 | for (i = 0; i < NR_LRU_LISTS; i++) |
1616 | pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], | 1616 | pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], |
1617 | K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); | 1617 | K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); |
1618 | 1618 | ||
1619 | pr_cont("\n"); | 1619 | pr_cont("\n"); |
1620 | } | 1620 | } |
1621 | mutex_unlock(&oom_info_lock); | 1621 | mutex_unlock(&oom_info_lock); |
1622 | } | 1622 | } |
1623 | 1623 | ||
1624 | /* | 1624 | /* |
1625 | * This function returns the number of memcgs in the hierarchy tree. Returns | 1625 | * This function returns the number of memcgs in the hierarchy tree. Returns |
1626 | * 1 (the self count) if there are no children. | 1626 | * 1 (the self count) if there are no children. |
1627 | */ | 1627 | */ |
1628 | static int mem_cgroup_count_children(struct mem_cgroup *memcg) | 1628 | static int mem_cgroup_count_children(struct mem_cgroup *memcg) |
1629 | { | 1629 | { |
1630 | int num = 0; | 1630 | int num = 0; |
1631 | struct mem_cgroup *iter; | 1631 | struct mem_cgroup *iter; |
1632 | 1632 | ||
1633 | for_each_mem_cgroup_tree(iter, memcg) | 1633 | for_each_mem_cgroup_tree(iter, memcg) |
1634 | num++; | 1634 | num++; |
1635 | return num; | 1635 | return num; |
1636 | } | 1636 | } |
1637 | 1637 | ||
1638 | /* | 1638 | /* |
1639 | * Return the memory (and swap, if configured) limit for a memcg. | 1639 | * Return the memory (and swap, if configured) limit for a memcg. |
1640 | */ | 1640 | */ |
1641 | static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1641 | static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1642 | { | 1642 | { |
1643 | unsigned long limit; | 1643 | unsigned long limit; |
1644 | 1644 | ||
1645 | limit = memcg->memory.limit; | 1645 | limit = memcg->memory.limit; |
1646 | if (mem_cgroup_swappiness(memcg)) { | 1646 | if (mem_cgroup_swappiness(memcg)) { |
1647 | unsigned long memsw_limit; | 1647 | unsigned long memsw_limit; |
1648 | 1648 | ||
1649 | memsw_limit = memcg->memsw.limit; | 1649 | memsw_limit = memcg->memsw.limit; |
1650 | limit = min(limit + total_swap_pages, memsw_limit); | 1650 | limit = min(limit + total_swap_pages, memsw_limit); |
1651 | } | 1651 | } |
1652 | return limit; | 1652 | return limit; |
1653 | } | 1653 | } |
1654 | 1654 | ||
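An illustrative calculation: with memory.limit at 262144 pages (1 GiB), total_swap_pages at 131072 (512 MiB) and memsw.limit at 327680 pages (1.25 GiB), a memcg with non-zero swappiness gets min(262144 + 131072, 327680) = 327680 pages as the base used for OOM badness scoring.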
1655 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | 1655 | static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, |
1656 | int order) | 1656 | int order) |
1657 | { | 1657 | { |
1658 | struct mem_cgroup *iter; | 1658 | struct mem_cgroup *iter; |
1659 | unsigned long chosen_points = 0; | 1659 | unsigned long chosen_points = 0; |
1660 | unsigned long totalpages; | 1660 | unsigned long totalpages; |
1661 | unsigned int points = 0; | 1661 | unsigned int points = 0; |
1662 | struct task_struct *chosen = NULL; | 1662 | struct task_struct *chosen = NULL; |
1663 | 1663 | ||
1664 | /* | 1664 | /* |
1665 | * If current has a pending SIGKILL or is exiting, then automatically | 1665 | * If current has a pending SIGKILL or is exiting, then automatically |
1666 | * select it. The goal is to allow it to allocate so that it may | 1666 | * select it. The goal is to allow it to allocate so that it may |
1667 | * quickly exit and free its memory. | 1667 | * quickly exit and free its memory. |
1668 | */ | 1668 | */ |
1669 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { | 1669 | if (fatal_signal_pending(current) || current->flags & PF_EXITING) { |
1670 | set_thread_flag(TIF_MEMDIE); | 1670 | set_thread_flag(TIF_MEMDIE); |
1671 | return; | 1671 | return; |
1672 | } | 1672 | } |
1673 | 1673 | ||
1674 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 1674 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
1675 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | 1675 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
1676 | for_each_mem_cgroup_tree(iter, memcg) { | 1676 | for_each_mem_cgroup_tree(iter, memcg) { |
1677 | struct css_task_iter it; | 1677 | struct css_task_iter it; |
1678 | struct task_struct *task; | 1678 | struct task_struct *task; |
1679 | 1679 | ||
1680 | css_task_iter_start(&iter->css, &it); | 1680 | css_task_iter_start(&iter->css, &it); |
1681 | while ((task = css_task_iter_next(&it))) { | 1681 | while ((task = css_task_iter_next(&it))) { |
1682 | switch (oom_scan_process_thread(task, totalpages, NULL, | 1682 | switch (oom_scan_process_thread(task, totalpages, NULL, |
1683 | false)) { | 1683 | false)) { |
1684 | case OOM_SCAN_SELECT: | 1684 | case OOM_SCAN_SELECT: |
1685 | if (chosen) | 1685 | if (chosen) |
1686 | put_task_struct(chosen); | 1686 | put_task_struct(chosen); |
1687 | chosen = task; | 1687 | chosen = task; |
1688 | chosen_points = ULONG_MAX; | 1688 | chosen_points = ULONG_MAX; |
1689 | get_task_struct(chosen); | 1689 | get_task_struct(chosen); |
1690 | /* fall through */ | 1690 | /* fall through */ |
1691 | case OOM_SCAN_CONTINUE: | 1691 | case OOM_SCAN_CONTINUE: |
1692 | continue; | 1692 | continue; |
1693 | case OOM_SCAN_ABORT: | 1693 | case OOM_SCAN_ABORT: |
1694 | css_task_iter_end(&it); | 1694 | css_task_iter_end(&it); |
1695 | mem_cgroup_iter_break(memcg, iter); | 1695 | mem_cgroup_iter_break(memcg, iter); |
1696 | if (chosen) | 1696 | if (chosen) |
1697 | put_task_struct(chosen); | 1697 | put_task_struct(chosen); |
1698 | return; | 1698 | return; |
1699 | case OOM_SCAN_OK: | 1699 | case OOM_SCAN_OK: |
1700 | break; | 1700 | break; |
1701 | }; | 1701 | }; |
1702 | points = oom_badness(task, memcg, NULL, totalpages); | 1702 | points = oom_badness(task, memcg, NULL, totalpages); |
1703 | if (!points || points < chosen_points) | 1703 | if (!points || points < chosen_points) |
1704 | continue; | 1704 | continue; |
1705 | /* Prefer thread group leaders for display purposes */ | 1705 | /* Prefer thread group leaders for display purposes */ |
1706 | if (points == chosen_points && | 1706 | if (points == chosen_points && |
1707 | thread_group_leader(chosen)) | 1707 | thread_group_leader(chosen)) |
1708 | continue; | 1708 | continue; |
1709 | 1709 | ||
1710 | if (chosen) | 1710 | if (chosen) |
1711 | put_task_struct(chosen); | 1711 | put_task_struct(chosen); |
1712 | chosen = task; | 1712 | chosen = task; |
1713 | chosen_points = points; | 1713 | chosen_points = points; |
1714 | get_task_struct(chosen); | 1714 | get_task_struct(chosen); |
1715 | } | 1715 | } |
1716 | css_task_iter_end(&it); | 1716 | css_task_iter_end(&it); |
1717 | } | 1717 | } |
1718 | 1718 | ||
1719 | if (!chosen) | 1719 | if (!chosen) |
1720 | return; | 1720 | return; |
1721 | points = chosen_points * 1000 / totalpages; | 1721 | points = chosen_points * 1000 / totalpages; |
1722 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | 1722 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, |
1723 | NULL, "Memory cgroup out of memory"); | 1723 | NULL, "Memory cgroup out of memory"); |
1724 | } | 1724 | } |
1725 | 1725 | ||
1726 | /** | 1726 | /** |
1727 | * test_mem_cgroup_node_reclaimable | 1727 | * test_mem_cgroup_node_reclaimable |
1728 | * @memcg: the target memcg | 1728 | * @memcg: the target memcg |
1729 | * @nid: the node ID to be checked. | 1729 | * @nid: the node ID to be checked. |
1730 | * @noswap: specify true here if the user wants file-only information. | 1730 | * @noswap: specify true here if the user wants file-only information. |
1731 | * | 1731 | * |
1732 | * This function returns whether the specified memcg contains any | 1732 | * This function returns whether the specified memcg contains any |
1733 | * reclaimable pages on a node. Returns true if there are any reclaimable | 1733 | * reclaimable pages on a node. Returns true if there are any reclaimable |
1734 | * pages in the node. | 1734 | * pages in the node. |
1735 | */ | 1735 | */ |
1736 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, | 1736 | static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, |
1737 | int nid, bool noswap) | 1737 | int nid, bool noswap) |
1738 | { | 1738 | { |
1739 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) | 1739 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) |
1740 | return true; | 1740 | return true; |
1741 | if (noswap || !total_swap_pages) | 1741 | if (noswap || !total_swap_pages) |
1742 | return false; | 1742 | return false; |
1743 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) | 1743 | if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) |
1744 | return true; | 1744 | return true; |
1745 | return false; | 1745 | return false; |
1746 | 1746 | ||
1747 | } | 1747 | } |
1748 | #if MAX_NUMNODES > 1 | 1748 | #if MAX_NUMNODES > 1 |
1749 | 1749 | ||
1750 | /* | 1750 | /* |
1751 | * Always updating the nodemask is not very good - even if we have an empty | 1751 | * Always updating the nodemask is not very good - even if we have an empty |
1752 | * list or the wrong list here, we can start from some node and traverse all | 1752 | * list or the wrong list here, we can start from some node and traverse all |
1753 | * nodes based on the zonelist. So update the list loosely once per 10 secs. | 1753 | * nodes based on the zonelist. So update the list loosely once per 10 secs. |
1754 | * | 1754 | * |
1755 | */ | 1755 | */ |
1756 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | 1756 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) |
1757 | { | 1757 | { |
1758 | int nid; | 1758 | int nid; |
1759 | /* | 1759 | /* |
1760 | * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET | 1760 | * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET |
1761 | * pagein/pageout changes since the last update. | 1761 | * pagein/pageout changes since the last update. |
1762 | */ | 1762 | */ |
1763 | if (!atomic_read(&memcg->numainfo_events)) | 1763 | if (!atomic_read(&memcg->numainfo_events)) |
1764 | return; | 1764 | return; |
1765 | if (atomic_inc_return(&memcg->numainfo_updating) > 1) | 1765 | if (atomic_inc_return(&memcg->numainfo_updating) > 1) |
1766 | return; | 1766 | return; |
1767 | 1767 | ||
1768 | /* make a nodemask where this memcg uses memory from */ | 1768 | /* make a nodemask where this memcg uses memory from */ |
1769 | memcg->scan_nodes = node_states[N_MEMORY]; | 1769 | memcg->scan_nodes = node_states[N_MEMORY]; |
1770 | 1770 | ||
1771 | for_each_node_mask(nid, node_states[N_MEMORY]) { | 1771 | for_each_node_mask(nid, node_states[N_MEMORY]) { |
1772 | 1772 | ||
1773 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 1773 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
1774 | node_clear(nid, memcg->scan_nodes); | 1774 | node_clear(nid, memcg->scan_nodes); |
1775 | } | 1775 | } |
1776 | 1776 | ||
1777 | atomic_set(&memcg->numainfo_events, 0); | 1777 | atomic_set(&memcg->numainfo_events, 0); |
1778 | atomic_set(&memcg->numainfo_updating, 0); | 1778 | atomic_set(&memcg->numainfo_updating, 0); |
1779 | } | 1779 | } |
1780 | 1780 | ||
1781 | /* | 1781 | /* |
1782 | * Select a node to start reclaim from. Because all we need is to reduce the | 1782 | * Select a node to start reclaim from. Because all we need is to reduce the |
1783 | * usage counter, starting from anywhere is OK. Reclaiming from the | 1783 | * usage counter, starting from anywhere is OK. Reclaiming from the |
1784 | * current node has both pros and cons. | 1784 | * current node has both pros and cons. |
1785 | * | 1785 | * |
1786 | * Freeing memory from the current node means freeing memory from a node which | 1786 | * Freeing memory from the current node means freeing memory from a node which |
1787 | * we will use or have used, so it may hurt the LRU. And if several threads | 1787 | * we will use or have used, so it may hurt the LRU. And if several threads |
1788 | * hit their limits, they will contend on that node. But freeing from a remote | 1788 | * hit their limits, they will contend on that node. But freeing from a remote |
1789 | * node costs more for memory reclaim because of memory latency. | 1789 | * node costs more for memory reclaim because of memory latency. |
1790 | * | 1790 | * |
1791 | * For now, we use round-robin. A better algorithm is welcome. | 1791 | * For now, we use round-robin. A better algorithm is welcome. |
1792 | */ | 1792 | */ |
1793 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1793 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1794 | { | 1794 | { |
1795 | int node; | 1795 | int node; |
1796 | 1796 | ||
1797 | mem_cgroup_may_update_nodemask(memcg); | 1797 | mem_cgroup_may_update_nodemask(memcg); |
1798 | node = memcg->last_scanned_node; | 1798 | node = memcg->last_scanned_node; |
1799 | 1799 | ||
1800 | node = next_node(node, memcg->scan_nodes); | 1800 | node = next_node(node, memcg->scan_nodes); |
1801 | if (node == MAX_NUMNODES) | 1801 | if (node == MAX_NUMNODES) |
1802 | node = first_node(memcg->scan_nodes); | 1802 | node = first_node(memcg->scan_nodes); |
1803 | /* | 1803 | /* |
1804 | * We call this when we hit the limit, not when pages are added to the LRU. | 1804 | * We call this when we hit the limit, not when pages are added to the LRU. |
1805 | * No LRU may hold pages because all pages are UNEVICTABLE, or | 1805 | * No LRU may hold pages because all pages are UNEVICTABLE, or |
1806 | * the memcg is too small and all pages are off the LRU. In that case, | 1806 | * the memcg is too small and all pages are off the LRU. In that case, |
1807 | * we use the current node. | 1807 | * we use the current node. |
1808 | */ | 1808 | */ |
1809 | if (unlikely(node == MAX_NUMNODES)) | 1809 | if (unlikely(node == MAX_NUMNODES)) |
1810 | node = numa_node_id(); | 1810 | node = numa_node_id(); |
1811 | 1811 | ||
1812 | memcg->last_scanned_node = node; | 1812 | memcg->last_scanned_node = node; |
1813 | return node; | 1813 | return node; |
1814 | } | 1814 | } |
1815 | 1815 | ||
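To illustrate the round-robin with made-up state: if scan_nodes contains nodes {0, 2, 5} and last_scanned_node is 5, next_node() returns MAX_NUMNODES, the walk wraps via first_node() and node 0 is chosen; only when scan_nodes is empty does the fallback to numa_node_id() trigger.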
1816 | /* | 1816 | /* |
1817 | * Check all nodes for whether they contain reclaimable pages. | 1817 | * Check all nodes for whether they contain reclaimable pages. |
1818 | * For a quick scan, we make use of scan_nodes. This allows us to skip | 1818 | * For a quick scan, we make use of scan_nodes. This allows us to skip |
1819 | * unused nodes. But scan_nodes is lazily updated and may not contain | 1819 | * unused nodes. But scan_nodes is lazily updated and may not contain |
1820 | * enough new information. We need to double check. | 1820 | * enough new information. We need to double check. |
1821 | */ | 1821 | */ |
1822 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1822 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1823 | { | 1823 | { |
1824 | int nid; | 1824 | int nid; |
1825 | 1825 | ||
1826 | /* | 1826 | /* |
1827 | * quick check...making use of scan_node. | 1827 | * quick check...making use of scan_node. |
1828 | * We can skip unused nodes. | 1828 | * We can skip unused nodes. |
1829 | */ | 1829 | */ |
1830 | if (!nodes_empty(memcg->scan_nodes)) { | 1830 | if (!nodes_empty(memcg->scan_nodes)) { |
1831 | for (nid = first_node(memcg->scan_nodes); | 1831 | for (nid = first_node(memcg->scan_nodes); |
1832 | nid < MAX_NUMNODES; | 1832 | nid < MAX_NUMNODES; |
1833 | nid = next_node(nid, memcg->scan_nodes)) { | 1833 | nid = next_node(nid, memcg->scan_nodes)) { |
1834 | 1834 | ||
1835 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1835 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1836 | return true; | 1836 | return true; |
1837 | } | 1837 | } |
1838 | } | 1838 | } |
1839 | /* | 1839 | /* |
1840 | * Check rest of nodes. | 1840 | * Check rest of nodes. |
1841 | */ | 1841 | */ |
1842 | for_each_node_state(nid, N_MEMORY) { | 1842 | for_each_node_state(nid, N_MEMORY) { |
1843 | if (node_isset(nid, memcg->scan_nodes)) | 1843 | if (node_isset(nid, memcg->scan_nodes)) |
1844 | continue; | 1844 | continue; |
1845 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1845 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
1846 | return true; | 1846 | return true; |
1847 | } | 1847 | } |
1848 | return false; | 1848 | return false; |
1849 | } | 1849 | } |
1850 | 1850 | ||
1851 | #else | 1851 | #else |
1852 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) | 1852 | int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) |
1853 | { | 1853 | { |
1854 | return 0; | 1854 | return 0; |
1855 | } | 1855 | } |
1856 | 1856 | ||
1857 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | 1857 | static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) |
1858 | { | 1858 | { |
1859 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); | 1859 | return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); |
1860 | } | 1860 | } |
1861 | #endif | 1861 | #endif |
1862 | 1862 | ||
1863 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, | 1863 | static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, |
1864 | struct zone *zone, | 1864 | struct zone *zone, |
1865 | gfp_t gfp_mask, | 1865 | gfp_t gfp_mask, |
1866 | unsigned long *total_scanned) | 1866 | unsigned long *total_scanned) |
1867 | { | 1867 | { |
1868 | struct mem_cgroup *victim = NULL; | 1868 | struct mem_cgroup *victim = NULL; |
1869 | int total = 0; | 1869 | int total = 0; |
1870 | int loop = 0; | 1870 | int loop = 0; |
1871 | unsigned long excess; | 1871 | unsigned long excess; |
1872 | unsigned long nr_scanned; | 1872 | unsigned long nr_scanned; |
1873 | struct mem_cgroup_reclaim_cookie reclaim = { | 1873 | struct mem_cgroup_reclaim_cookie reclaim = { |
1874 | .zone = zone, | 1874 | .zone = zone, |
1875 | .priority = 0, | 1875 | .priority = 0, |
1876 | }; | 1876 | }; |
1877 | 1877 | ||
1878 | excess = soft_limit_excess(root_memcg); | 1878 | excess = soft_limit_excess(root_memcg); |
1879 | 1879 | ||
1880 | while (1) { | 1880 | while (1) { |
1881 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); | 1881 | victim = mem_cgroup_iter(root_memcg, victim, &reclaim); |
1882 | if (!victim) { | 1882 | if (!victim) { |
1883 | loop++; | 1883 | loop++; |
1884 | if (loop >= 2) { | 1884 | if (loop >= 2) { |
1885 | /* | 1885 | /* |
1886 | * If we have not been able to reclaim | 1886 | * If we have not been able to reclaim |
1887 | * anything, it might be because there are | 1887 | * anything, it might be because there are |
1888 | * no reclaimable pages under this hierarchy | 1888 | * no reclaimable pages under this hierarchy |
1889 | */ | 1889 | */ |
1890 | if (!total) | 1890 | if (!total) |
1891 | break; | 1891 | break; |
1892 | /* | 1892 | /* |
1893 | * We want to do more targeted reclaim. | 1893 | * We want to do more targeted reclaim. |
1894 | * excess >> 2 is not too excessive, so we do not | 1894 | * excess >> 2 is not too excessive, so we do not |
1895 | * reclaim too much, nor too little, which would keep | 1895 | * reclaim too much, nor too little, which would keep |
1896 | * us coming back to reclaim from this cgroup | 1896 | * us coming back to reclaim from this cgroup |
1897 | */ | 1897 | */ |
1898 | if (total >= (excess >> 2) || | 1898 | if (total >= (excess >> 2) || |
1899 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) | 1899 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) |
1900 | break; | 1900 | break; |
1901 | } | 1901 | } |
1902 | continue; | 1902 | continue; |
1903 | } | 1903 | } |
1904 | if (!mem_cgroup_reclaimable(victim, false)) | 1904 | if (!mem_cgroup_reclaimable(victim, false)) |
1905 | continue; | 1905 | continue; |
1906 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, | 1906 | total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, |
1907 | zone, &nr_scanned); | 1907 | zone, &nr_scanned); |
1908 | *total_scanned += nr_scanned; | 1908 | *total_scanned += nr_scanned; |
1909 | if (!soft_limit_excess(root_memcg)) | 1909 | if (!soft_limit_excess(root_memcg)) |
1910 | break; | 1910 | break; |
1911 | } | 1911 | } |
1912 | mem_cgroup_iter_break(root_memcg, victim); | 1912 | mem_cgroup_iter_break(root_memcg, victim); |
1913 | return total; | 1913 | return total; |
1914 | } | 1914 | } |
1915 | 1915 | ||
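A worked example of the termination logic above (illustrative numbers): if the root memcg exceeds its soft limit by 400 pages, excess >> 2 is 100, so from the second full round-trip onward the loop stops once at least 100 pages have been reclaimed in total; it also stops whenever the excess drops to zero or loop exceeds MEM_CGROUP_MAX_RECLAIM_LOOPS.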
1916 | #ifdef CONFIG_LOCKDEP | 1916 | #ifdef CONFIG_LOCKDEP |
1917 | static struct lockdep_map memcg_oom_lock_dep_map = { | 1917 | static struct lockdep_map memcg_oom_lock_dep_map = { |
1918 | .name = "memcg_oom_lock", | 1918 | .name = "memcg_oom_lock", |
1919 | }; | 1919 | }; |
1920 | #endif | 1920 | #endif |
1921 | 1921 | ||
1922 | static DEFINE_SPINLOCK(memcg_oom_lock); | 1922 | static DEFINE_SPINLOCK(memcg_oom_lock); |
1923 | 1923 | ||
1924 | /* | 1924 | /* |
1925 | * Check whether the OOM killer is already running in our hierarchy. | 1925 | * Check whether the OOM killer is already running in our hierarchy. |
1926 | * If someone is already running it, return false. | 1926 | * If someone is already running it, return false. |
1927 | */ | 1927 | */ |
1928 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) | 1928 | static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) |
1929 | { | 1929 | { |
1930 | struct mem_cgroup *iter, *failed = NULL; | 1930 | struct mem_cgroup *iter, *failed = NULL; |
1931 | 1931 | ||
1932 | spin_lock(&memcg_oom_lock); | 1932 | spin_lock(&memcg_oom_lock); |
1933 | 1933 | ||
1934 | for_each_mem_cgroup_tree(iter, memcg) { | 1934 | for_each_mem_cgroup_tree(iter, memcg) { |
1935 | if (iter->oom_lock) { | 1935 | if (iter->oom_lock) { |
1936 | /* | 1936 | /* |
1937 | * this subtree of our hierarchy is already locked, | 1937 | * this subtree of our hierarchy is already locked, |
1938 | * so we cannot take the lock. | 1938 | * so we cannot take the lock. |
1939 | */ | 1939 | */ |
1940 | failed = iter; | 1940 | failed = iter; |
1941 | mem_cgroup_iter_break(memcg, iter); | 1941 | mem_cgroup_iter_break(memcg, iter); |
1942 | break; | 1942 | break; |
1943 | } else | 1943 | } else |
1944 | iter->oom_lock = true; | 1944 | iter->oom_lock = true; |
1945 | } | 1945 | } |
1946 | 1946 | ||
1947 | if (failed) { | 1947 | if (failed) { |
1948 | /* | 1948 | /* |
1949 | * OK, we failed to lock the whole subtree, so we have | 1949 | * OK, we failed to lock the whole subtree, so we have |
1950 | * to clean up what we set up, up to the failing subtree | 1950 | * to clean up what we set up, up to the failing subtree |
1951 | */ | 1951 | */ |
1952 | for_each_mem_cgroup_tree(iter, memcg) { | 1952 | for_each_mem_cgroup_tree(iter, memcg) { |
1953 | if (iter == failed) { | 1953 | if (iter == failed) { |
1954 | mem_cgroup_iter_break(memcg, iter); | 1954 | mem_cgroup_iter_break(memcg, iter); |
1955 | break; | 1955 | break; |
1956 | } | 1956 | } |
1957 | iter->oom_lock = false; | 1957 | iter->oom_lock = false; |
1958 | } | 1958 | } |
1959 | } else | 1959 | } else |
1960 | mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); | 1960 | mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); |
1961 | 1961 | ||
1962 | spin_unlock(&memcg_oom_lock); | 1962 | spin_unlock(&memcg_oom_lock); |
1963 | 1963 | ||
1964 | return !failed; | 1964 | return !failed; |
1965 | } | 1965 | } |
1966 | 1966 | ||
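A minimal sketch of how the trylock/unlock pair is meant to be used (simplified: the real callers also mark the hierarchy under OOM and may sleep on memcg_oom_waitq):

	if (mem_cgroup_oom_trylock(memcg)) {
		mem_cgroup_out_of_memory(memcg, gfp_mask, order);
		mem_cgroup_oom_unlock(memcg);
	}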
1967 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) | 1967 | static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) |
1968 | { | 1968 | { |
1969 | struct mem_cgroup *iter; | 1969 | struct mem_cgroup *iter; |
1970 | 1970 | ||
1971 | spin_lock(&memcg_oom_lock); | 1971 | spin_lock(&memcg_oom_lock); |
1972 | mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); | 1972 | mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); |
1973 | for_each_mem_cgroup_tree(iter, memcg) | 1973 | for_each_mem_cgroup_tree(iter, memcg) |
1974 | iter->oom_lock = false; | 1974 | iter->oom_lock = false; |
1975 | spin_unlock(&memcg_oom_lock); | 1975 | spin_unlock(&memcg_oom_lock); |
1976 | } | 1976 | } |
1977 | 1977 | ||
1978 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) | 1978 | static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) |
1979 | { | 1979 | { |
1980 | struct mem_cgroup *iter; | 1980 | struct mem_cgroup *iter; |
1981 | 1981 | ||
1982 | for_each_mem_cgroup_tree(iter, memcg) | 1982 | for_each_mem_cgroup_tree(iter, memcg) |
1983 | atomic_inc(&iter->under_oom); | 1983 | atomic_inc(&iter->under_oom); |
1984 | } | 1984 | } |
1985 | 1985 | ||
1986 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) | 1986 | static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) |
1987 | { | 1987 | { |
1988 | struct mem_cgroup *iter; | 1988 | struct mem_cgroup *iter; |
1989 | 1989 | ||
1990 | /* | 1990 | /* |
1991 | * When a new child is created while the hierarchy is under oom, | 1991 | * When a new child is created while the hierarchy is under oom, |
1992 | * mem_cgroup_oom_lock() may not be called. We have to use | 1992 | * mem_cgroup_oom_lock() may not be called. We have to use |
1993 | * atomic_add_unless() here. | 1993 | * atomic_add_unless() here. |
1994 | */ | 1994 | */ |
1995 | for_each_mem_cgroup_tree(iter, memcg) | 1995 | for_each_mem_cgroup_tree(iter, memcg) |
1996 | atomic_add_unless(&iter->under_oom, -1, 0); | 1996 | atomic_add_unless(&iter->under_oom, -1, 0); |
1997 | } | 1997 | } |
1998 | 1998 | ||
1999 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1999 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
2000 | 2000 | ||
2001 | struct oom_wait_info { | 2001 | struct oom_wait_info { |
2002 | struct mem_cgroup *memcg; | 2002 | struct mem_cgroup *memcg; |
2003 | wait_queue_t wait; | 2003 | wait_queue_t wait; |
2004 | }; | 2004 | }; |
2005 | 2005 | ||
2006 | static int memcg_oom_wake_function(wait_queue_t *wait, | 2006 | static int memcg_oom_wake_function(wait_queue_t *wait, |
2007 | unsigned mode, int sync, void *arg) | 2007 | unsigned mode, int sync, void *arg) |
2008 | { | 2008 | { |
2009 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; | 2009 | struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; |
2010 | struct mem_cgroup *oom_wait_memcg; | 2010 | struct mem_cgroup *oom_wait_memcg; |
2011 | struct oom_wait_info *oom_wait_info; | 2011 | struct oom_wait_info *oom_wait_info; |
2012 | 2012 | ||
2013 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | 2013 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); |
2014 | oom_wait_memcg = oom_wait_info->memcg; | 2014 | oom_wait_memcg = oom_wait_info->memcg; |
2015 | 2015 | ||
2016 | /* | 2016 | /* |
2017 | * Both oom_wait_info->memcg and wake_memcg are stable under us, | 2017 | * Both oom_wait_info->memcg and wake_memcg are stable under us, |
2018 | * so we can use css_is_ancestor() without worrying about RCU. | 2018 | * so we can use css_is_ancestor() without worrying about RCU. |
2019 | */ | 2019 | */ |
2020 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) | 2020 | if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) |
2021 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) | 2021 | && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) |
2022 | return 0; | 2022 | return 0; |
2023 | return autoremove_wake_function(wait, mode, sync, arg); | 2023 | return autoremove_wake_function(wait, mode, sync, arg); |
2024 | } | 2024 | } |
2025 | 2025 | ||
2026 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) | 2026 | static void memcg_wakeup_oom(struct mem_cgroup *memcg) |
2027 | { | 2027 | { |
2028 | atomic_inc(&memcg->oom_wakeups); | 2028 | atomic_inc(&memcg->oom_wakeups); |
2029 | /* for filtering, pass "memcg" as argument. */ | 2029 | /* for filtering, pass "memcg" as argument. */ |
2030 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); | 2030 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); |
2031 | } | 2031 | } |
2032 | 2032 | ||
2033 | static void memcg_oom_recover(struct mem_cgroup *memcg) | 2033 | static void memcg_oom_recover(struct mem_cgroup *memcg) |
2034 | { | 2034 | { |
2035 | if (memcg && atomic_read(&memcg->under_oom)) | 2035 | if (memcg && atomic_read(&memcg->under_oom)) |
2036 | memcg_wakeup_oom(memcg); | 2036 | memcg_wakeup_oom(memcg); |
2037 | } | 2037 | } |
2038 | 2038 | ||
2039 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 2039 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
2040 | { | 2040 | { |
2041 | if (!current->memcg_oom.may_oom) | 2041 | if (!current->memcg_oom.may_oom) |
2042 | return; | 2042 | return; |
2043 | /* | 2043 | /* |
2044 | * We are in the middle of the charge context here, so we | 2044 | * We are in the middle of the charge context here, so we |
2045 | * don't want to block when potentially sitting on a callstack | 2045 | * don't want to block when potentially sitting on a callstack |
2046 | * that holds all kinds of filesystem and mm locks. | 2046 | * that holds all kinds of filesystem and mm locks. |
2047 | * | 2047 | * |
2048 | * Also, the caller may handle a failed allocation gracefully | 2048 | * Also, the caller may handle a failed allocation gracefully |
2049 | * (like optional page cache readahead) and so an OOM killer | 2049 | * (like optional page cache readahead) and so an OOM killer |
2050 | * invocation might not even be necessary. | 2050 | * invocation might not even be necessary. |
2051 | * | 2051 | * |
2052 | * That's why we don't do anything here except remember the | 2052 | * That's why we don't do anything here except remember the |
2053 | * OOM context and then deal with it at the end of the page | 2053 | * OOM context and then deal with it at the end of the page |
2054 | * fault when the stack is unwound, the locks are released, | 2054 | * fault when the stack is unwound, the locks are released, |
2055 | * and when we know whether the fault was overall successful. | 2055 | * and when we know whether the fault was overall successful. |
2056 | */ | 2056 | */ |
2057 | css_get(&memcg->css); | 2057 | css_get(&memcg->css); |
2058 | current->memcg_oom.memcg = memcg; | 2058 | current->memcg_oom.memcg = memcg; |
2059 | current->memcg_oom.gfp_mask = mask; | 2059 | current->memcg_oom.gfp_mask = mask; |
2060 | current->memcg_oom.order = order; | 2060 | current->memcg_oom.order = order; |
2061 | } | 2061 | } |
2062 | 2062 | ||
2063 | /** | 2063 | /** |
2064 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | 2064 | * mem_cgroup_oom_synchronize - complete memcg OOM handling |
2065 | * @handle: actually kill/wait or just clean up the OOM state | 2065 | * @handle: actually kill/wait or just clean up the OOM state |
2066 | * | 2066 | * |
2067 | * This has to be called at the end of a page fault if the memcg OOM | 2067 | * This has to be called at the end of a page fault if the memcg OOM |
2068 | * handler was enabled. | 2068 | * handler was enabled. |
2069 | * | 2069 | * |
2070 | * Memcg supports userspace OOM handling where failed allocations must | 2070 | * Memcg supports userspace OOM handling where failed allocations must |
2071 | * sleep on a waitqueue until the userspace task resolves the | 2071 | * sleep on a waitqueue until the userspace task resolves the |
2072 | * situation. Sleeping directly in the charge context with all kinds | 2072 | * situation. Sleeping directly in the charge context with all kinds |
2073 | * of locks held is not a good idea, instead we remember an OOM state | 2073 | * of locks held is not a good idea, instead we remember an OOM state |
2074 | * in the task and mem_cgroup_oom_synchronize() has to be called at | 2074 | * in the task and mem_cgroup_oom_synchronize() has to be called at |
2075 | * the end of the page fault to complete the OOM handling. | 2075 | * the end of the page fault to complete the OOM handling. |
2076 | * | 2076 | * |
2077 | * Returns %true if an ongoing memcg OOM situation was detected and | 2077 | * Returns %true if an ongoing memcg OOM situation was detected and |
2078 | * completed, %false otherwise. | 2078 | * completed, %false otherwise. |
2079 | */ | 2079 | */ |
2080 | bool mem_cgroup_oom_synchronize(bool handle) | 2080 | bool mem_cgroup_oom_synchronize(bool handle) |
2081 | { | 2081 | { |
2082 | struct mem_cgroup *memcg = current->memcg_oom.memcg; | 2082 | struct mem_cgroup *memcg = current->memcg_oom.memcg; |
2083 | struct oom_wait_info owait; | 2083 | struct oom_wait_info owait; |
2084 | bool locked; | 2084 | bool locked; |
2085 | 2085 | ||
2086 | /* OOM is global, do not handle */ | 2086 | /* OOM is global, do not handle */ |
2087 | if (!memcg) | 2087 | if (!memcg) |
2088 | return false; | 2088 | return false; |
2089 | 2089 | ||
2090 | if (!handle) | 2090 | if (!handle) |
2091 | goto cleanup; | 2091 | goto cleanup; |
2092 | 2092 | ||
2093 | owait.memcg = memcg; | 2093 | owait.memcg = memcg; |
2094 | owait.wait.flags = 0; | 2094 | owait.wait.flags = 0; |
2095 | owait.wait.func = memcg_oom_wake_function; | 2095 | owait.wait.func = memcg_oom_wake_function; |
2096 | owait.wait.private = current; | 2096 | owait.wait.private = current; |
2097 | INIT_LIST_HEAD(&owait.wait.task_list); | 2097 | INIT_LIST_HEAD(&owait.wait.task_list); |
2098 | 2098 | ||
2099 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2099 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
2100 | mem_cgroup_mark_under_oom(memcg); | 2100 | mem_cgroup_mark_under_oom(memcg); |
2101 | 2101 | ||
2102 | locked = mem_cgroup_oom_trylock(memcg); | 2102 | locked = mem_cgroup_oom_trylock(memcg); |
2103 | 2103 | ||
2104 | if (locked) | 2104 | if (locked) |
2105 | mem_cgroup_oom_notify(memcg); | 2105 | mem_cgroup_oom_notify(memcg); |
2106 | 2106 | ||
2107 | if (locked && !memcg->oom_kill_disable) { | 2107 | if (locked && !memcg->oom_kill_disable) { |
2108 | mem_cgroup_unmark_under_oom(memcg); | 2108 | mem_cgroup_unmark_under_oom(memcg); |
2109 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2109 | finish_wait(&memcg_oom_waitq, &owait.wait); |
2110 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, | 2110 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, |
2111 | current->memcg_oom.order); | 2111 | current->memcg_oom.order); |
2112 | } else { | 2112 | } else { |
2113 | schedule(); | 2113 | schedule(); |
2114 | mem_cgroup_unmark_under_oom(memcg); | 2114 | mem_cgroup_unmark_under_oom(memcg); |
2115 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2115 | finish_wait(&memcg_oom_waitq, &owait.wait); |
2116 | } | 2116 | } |
2117 | 2117 | ||
2118 | if (locked) { | 2118 | if (locked) { |
2119 | mem_cgroup_oom_unlock(memcg); | 2119 | mem_cgroup_oom_unlock(memcg); |
2120 | /* | 2120 | /* |
2121 | * There is no guarantee that an OOM-lock contender | 2121 | * There is no guarantee that an OOM-lock contender |
2122 | * sees the wakeups triggered by the OOM kill | 2122 | * sees the wakeups triggered by the OOM kill |
2123 | * uncharges. Wake any sleepers explicitly. | 2123 | * uncharges. Wake any sleepers explicitly. |
2124 | */ | 2124 | */ |
2125 | memcg_oom_recover(memcg); | 2125 | memcg_oom_recover(memcg); |
2126 | } | 2126 | } |
2127 | cleanup: | 2127 | cleanup: |
2128 | current->memcg_oom.memcg = NULL; | 2128 | current->memcg_oom.memcg = NULL; |
2129 | css_put(&memcg->css); | 2129 | css_put(&memcg->css); |
2130 | return true; | 2130 | return true; |
2131 | } | 2131 | } |
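The two functions above split memcg OOM handling into two phases: mem_cgroup_oom() only records the OOM context on the task while the charge path still holds page-table and filesystem locks, and mem_cgroup_oom_synchronize() performs the actual kill-or-wait once the page fault has unwound. A rough userspace model of that protocol, assuming the made-up names task_oom_state, charge_path() and fault_end():

#include <stdbool.h>
#include <stdio.h>

struct task_oom_state {
        bool may_oom;
        int  memcg_id;          /* stand-in for the css-pinned memcg pointer */
        int  order;
};

static struct task_oom_state current_oom;

/* Like mem_cgroup_oom(): remember the context, never block here. */
static void charge_path(int memcg_id, int order)
{
        if (!current_oom.may_oom)
                return;
        current_oom.memcg_id = memcg_id;
        current_oom.order = order;
}

/* Like mem_cgroup_oom_synchronize(): resolve it at the end of the fault. */
static bool fault_end(bool handle)
{
        if (!current_oom.memcg_id)
                return false;           /* the OOM was global, nothing to do */
        if (handle)
                printf("resolving OOM in memcg %d, order %d\n",
                       current_oom.memcg_id, current_oom.order);
        current_oom.memcg_id = 0;       /* analogous to the css_put() in cleanup */
        return true;
}

int main(void)
{
        current_oom.may_oom = true;
        charge_path(42, 0);     /* the charge fails while locks are held */
        fault_end(true);        /* the kill or wait only happens here */
        return 0;
}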
2132 | 2132 | ||
2133 | /** | 2133 | /** |
2134 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction | 2134 | * mem_cgroup_begin_page_stat - begin a page state statistics transaction |
2135 | * @page: page that is going to change accounted state | 2135 | * @page: page that is going to change accounted state |
2136 | * @locked: &memcg->move_lock slowpath was taken | 2136 | * @locked: &memcg->move_lock slowpath was taken |
2137 | * @flags: IRQ-state flags for &memcg->move_lock | 2137 | * @flags: IRQ-state flags for &memcg->move_lock |
2138 | * | 2138 | * |
2139 | * This function must mark the beginning of an accounted page state | 2139 | * This function must mark the beginning of an accounted page state |
2140 | * change to prevent double accounting when the page is concurrently | 2140 | * change to prevent double accounting when the page is concurrently |
2141 | * being moved to another memcg: | 2141 | * being moved to another memcg: |
2142 | * | 2142 | * |
2143 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); | 2143 | * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); |
2144 | * if (TestClearPageState(page)) | 2144 | * if (TestClearPageState(page)) |
2145 | * mem_cgroup_update_page_stat(memcg, state, -1); | 2145 | * mem_cgroup_update_page_stat(memcg, state, -1); |
2146 | * mem_cgroup_end_page_stat(memcg, locked, flags); | 2146 | * mem_cgroup_end_page_stat(memcg, locked, flags); |
2147 | * | 2147 | * |
2148 | * The RCU lock is held throughout the transaction. The fast path can | 2148 | * The RCU lock is held throughout the transaction. The fast path can |
2149 | * get away without acquiring the memcg->move_lock (@locked is false) | 2149 | * get away without acquiring the memcg->move_lock (@locked is false) |
2150 | * because page moving starts with an RCU grace period. | 2150 | * because page moving starts with an RCU grace period. |
2151 | * | 2151 | * |
2152 | * The RCU lock also protects the memcg from being freed when the page | 2152 | * The RCU lock also protects the memcg from being freed when the page |
2153 | * state that is going to change is the only thing preventing the page | 2153 | * state that is going to change is the only thing preventing the page |
2154 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), | 2154 | * from being uncharged. E.g. end-writeback clearing PageWriteback(), |
2155 | * which allows migration to go ahead and uncharge the page before the | 2155 | * which allows migration to go ahead and uncharge the page before the |
2156 | * account transaction might be complete. | 2156 | * account transaction might be complete. |
2157 | */ | 2157 | */ |
2158 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, | 2158 | struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, |
2159 | bool *locked, | 2159 | bool *locked, |
2160 | unsigned long *flags) | 2160 | unsigned long *flags) |
2161 | { | 2161 | { |
2162 | struct mem_cgroup *memcg; | 2162 | struct mem_cgroup *memcg; |
2163 | struct page_cgroup *pc; | 2163 | struct page_cgroup *pc; |
2164 | 2164 | ||
2165 | rcu_read_lock(); | 2165 | rcu_read_lock(); |
2166 | 2166 | ||
2167 | if (mem_cgroup_disabled()) | 2167 | if (mem_cgroup_disabled()) |
2168 | return NULL; | 2168 | return NULL; |
2169 | 2169 | ||
2170 | pc = lookup_page_cgroup(page); | 2170 | pc = lookup_page_cgroup(page); |
2171 | again: | 2171 | again: |
2172 | memcg = pc->mem_cgroup; | 2172 | memcg = pc->mem_cgroup; |
2173 | if (unlikely(!memcg || !PageCgroupUsed(pc))) | 2173 | if (unlikely(!memcg || !PageCgroupUsed(pc))) |
2174 | return NULL; | 2174 | return NULL; |
2175 | 2175 | ||
2176 | *locked = false; | 2176 | *locked = false; |
2177 | if (atomic_read(&memcg->moving_account) <= 0) | 2177 | if (atomic_read(&memcg->moving_account) <= 0) |
2178 | return memcg; | 2178 | return memcg; |
2179 | 2179 | ||
2180 | move_lock_mem_cgroup(memcg, flags); | 2180 | move_lock_mem_cgroup(memcg, flags); |
2181 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { | 2181 | if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { |
2182 | move_unlock_mem_cgroup(memcg, flags); | 2182 | move_unlock_mem_cgroup(memcg, flags); |
2183 | goto again; | 2183 | goto again; |
2184 | } | 2184 | } |
2185 | *locked = true; | 2185 | *locked = true; |
2186 | 2186 | ||
2187 | return memcg; | 2187 | return memcg; |
2188 | } | 2188 | } |
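mem_cgroup_begin_page_stat() uses an optimistic-read scheme: read pc->mem_cgroup without the move_lock, and only when a task move might be in flight take the lock and re-check that the association did not change underneath, retrying if it did. A self-contained sketch of that shape (not the kernel code; a plain flag named moving_account replaces the atomic counter and spinlock, and struct obj and lookup_owner() are made-up stand-ins):

#include <stdbool.h>
#include <stdio.h>

struct obj { int owner; };

static bool moving_account;     /* the kernel uses an atomic counter plus move_lock */

static int lookup_owner(struct obj *o, bool *locked)
{
        int owner;
again:
        owner = o->owner;               /* optimistic, unlocked read */
        *locked = false;
        if (!moving_account)
                return owner;           /* fast path: no task move in flight */

        /* slow path: "take the move lock" and re-validate the snapshot */
        *locked = true;
        if (owner != o->owner)
                goto again;
        return owner;
}

int main(void)
{
        struct obj o = { .owner = 1 };
        bool locked;
        int owner;

        owner = lookup_owner(&o, &locked);
        printf("owner=%d locked=%d\n", owner, locked);

        moving_account = true;
        owner = lookup_owner(&o, &locked);
        printf("owner=%d locked=%d\n", owner, locked);
        return 0;
}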
2189 | 2189 | ||
2190 | /** | 2190 | /** |
2191 | * mem_cgroup_end_page_stat - finish a page state statistics transaction | 2191 | * mem_cgroup_end_page_stat - finish a page state statistics transaction |
2192 | * @memcg: the memcg that was accounted against | 2192 | * @memcg: the memcg that was accounted against |
2193 | * @locked: value received from mem_cgroup_begin_page_stat() | 2193 | * @locked: value received from mem_cgroup_begin_page_stat() |
2194 | * @flags: value received from mem_cgroup_begin_page_stat() | 2194 | * @flags: value received from mem_cgroup_begin_page_stat() |
2195 | */ | 2195 | */ |
2196 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, | 2196 | void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked, |
2197 | unsigned long flags) | 2197 | unsigned long flags) |
2198 | { | 2198 | { |
2199 | if (memcg && locked) | 2199 | if (memcg && locked) |
2200 | move_unlock_mem_cgroup(memcg, &flags); | 2200 | move_unlock_mem_cgroup(memcg, &flags); |
2201 | 2201 | ||
2202 | rcu_read_unlock(); | 2202 | rcu_read_unlock(); |
2203 | } | 2203 | } |
2204 | 2204 | ||
2205 | /** | 2205 | /** |
2206 | * mem_cgroup_update_page_stat - update page state statistics | 2206 | * mem_cgroup_update_page_stat - update page state statistics |
2207 | * @memcg: memcg to account against | 2207 | * @memcg: memcg to account against |
2208 | * @idx: page state item to account | 2208 | * @idx: page state item to account |
2209 | * @val: number of pages (positive or negative) | 2209 | * @val: number of pages (positive or negative) |
2210 | * | 2210 | * |
2211 | * See mem_cgroup_begin_page_stat() for locking requirements. | 2211 | * See mem_cgroup_begin_page_stat() for locking requirements. |
2212 | */ | 2212 | */ |
2213 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, | 2213 | void mem_cgroup_update_page_stat(struct mem_cgroup *memcg, |
2214 | enum mem_cgroup_stat_index idx, int val) | 2214 | enum mem_cgroup_stat_index idx, int val) |
2215 | { | 2215 | { |
2216 | VM_BUG_ON(!rcu_read_lock_held()); | 2216 | VM_BUG_ON(!rcu_read_lock_held()); |
2217 | 2217 | ||
2218 | if (memcg) | 2218 | if (memcg) |
2219 | this_cpu_add(memcg->stat->count[idx], val); | 2219 | this_cpu_add(memcg->stat->count[idx], val); |
2220 | } | 2220 | } |
2221 | 2221 | ||
2222 | /* | 2222 | /* |
2223 | * Size of the first charge trial. "32" comes from vmscan.c's magic value. | 2223 | * Size of the first charge trial. "32" comes from vmscan.c's magic value. |
2224 | * TODO: it may be necessary to use bigger numbers on big iron. | 2224 | * TODO: it may be necessary to use bigger numbers on big iron. |
2225 | */ | 2225 | */ |
2226 | #define CHARGE_BATCH 32U | 2226 | #define CHARGE_BATCH 32U |
2227 | struct memcg_stock_pcp { | 2227 | struct memcg_stock_pcp { |
2228 | struct mem_cgroup *cached; /* this is never the root cgroup */ | 2228 | struct mem_cgroup *cached; /* this is never the root cgroup */ |
2229 | unsigned int nr_pages; | 2229 | unsigned int nr_pages; |
2230 | struct work_struct work; | 2230 | struct work_struct work; |
2231 | unsigned long flags; | 2231 | unsigned long flags; |
2232 | #define FLUSHING_CACHED_CHARGE 0 | 2232 | #define FLUSHING_CACHED_CHARGE 0 |
2233 | }; | 2233 | }; |
2234 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2234 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
2235 | static DEFINE_MUTEX(percpu_charge_mutex); | 2235 | static DEFINE_MUTEX(percpu_charge_mutex); |
2236 | 2236 | ||
2237 | /** | 2237 | /** |
2238 | * consume_stock: Try to consume stocked charge on this cpu. | 2238 | * consume_stock: Try to consume stocked charge on this cpu. |
2239 | * @memcg: memcg to consume from. | 2239 | * @memcg: memcg to consume from. |
2240 | * @nr_pages: how many pages to charge. | 2240 | * @nr_pages: how many pages to charge. |
2241 | * | 2241 | * |
2242 | * The charges will only happen if @memcg matches the current cpu's memcg | 2242 | * The charges will only happen if @memcg matches the current cpu's memcg |
2243 | * stock, and at least @nr_pages are available in that stock. Failure to | 2243 | * stock, and at least @nr_pages are available in that stock. Failure to |
2244 | * service an allocation will refill the stock. | 2244 | * service an allocation will refill the stock. |
2245 | * | 2245 | * |
2246 | * returns true if successful, false otherwise. | 2246 | * returns true if successful, false otherwise. |
2247 | */ | 2247 | */ |
2248 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2248 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2249 | { | 2249 | { |
2250 | struct memcg_stock_pcp *stock; | 2250 | struct memcg_stock_pcp *stock; |
2251 | bool ret = false; | 2251 | bool ret = false; |
2252 | 2252 | ||
2253 | if (nr_pages > CHARGE_BATCH) | 2253 | if (nr_pages > CHARGE_BATCH) |
2254 | return ret; | 2254 | return ret; |
2255 | 2255 | ||
2256 | stock = &get_cpu_var(memcg_stock); | 2256 | stock = &get_cpu_var(memcg_stock); |
2257 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) { | 2257 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) { |
2258 | stock->nr_pages -= nr_pages; | 2258 | stock->nr_pages -= nr_pages; |
2259 | ret = true; | 2259 | ret = true; |
2260 | } | 2260 | } |
2261 | put_cpu_var(memcg_stock); | 2261 | put_cpu_var(memcg_stock); |
2262 | return ret; | 2262 | return ret; |
2263 | } | 2263 | } |
2264 | 2264 | ||
2265 | /* | 2265 | /* |
2266 | * Returns the charges cached in the percpu stock to the counters and resets the cached information. | 2266 | * Returns the charges cached in the percpu stock to the counters and resets the cached information. |
2267 | */ | 2267 | */ |
2268 | static void drain_stock(struct memcg_stock_pcp *stock) | 2268 | static void drain_stock(struct memcg_stock_pcp *stock) |
2269 | { | 2269 | { |
2270 | struct mem_cgroup *old = stock->cached; | 2270 | struct mem_cgroup *old = stock->cached; |
2271 | 2271 | ||
2272 | if (stock->nr_pages) { | 2272 | if (stock->nr_pages) { |
2273 | page_counter_uncharge(&old->memory, stock->nr_pages); | 2273 | page_counter_uncharge(&old->memory, stock->nr_pages); |
2274 | if (do_swap_account) | 2274 | if (do_swap_account) |
2275 | page_counter_uncharge(&old->memsw, stock->nr_pages); | 2275 | page_counter_uncharge(&old->memsw, stock->nr_pages); |
2276 | css_put_many(&old->css, stock->nr_pages); | ||
2276 | stock->nr_pages = 0; | 2277 | stock->nr_pages = 0; |
2277 | } | 2278 | } |
2278 | stock->cached = NULL; | 2279 | stock->cached = NULL; |
2279 | } | 2280 | } |
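With this patch each charged page now pins the css, and the pages parked in the per-cpu stock are still charged, so drain_stock() has to drop one reference per cached page (the css_put_many() added above) or those references would leak. A toy userspace model of that bookkeeping, using the invented names css_t, stock_t and drain_stock_model():

#include <assert.h>
#include <stdio.h>

typedef struct { long refs; } css_t;    /* stand-in for cgroup_subsys_state */
typedef struct { css_t *cached; unsigned int nr_pages; } stock_t;

static void css_get_many(css_t *css, unsigned int n) { css->refs += n; }
static void css_put_many(css_t *css, unsigned int n) { css->refs -= n; assert(css->refs >= 0); }

static void drain_stock_model(stock_t *stock)
{
        if (stock->nr_pages) {
                /* the page counters would be uncharged here ... */
                css_put_many(stock->cached, stock->nr_pages);   /* the new put */
                stock->nr_pages = 0;
        }
        stock->cached = NULL;
}

int main(void)
{
        css_t css = { .refs = 1 };      /* base reference */
        stock_t stock = { .cached = &css, .nr_pages = 0 };

        css_get_many(&css, 32);         /* a charge takes refs for the whole batch */
        stock.nr_pages = 24;            /* 8 pages used, 24 parked in the stock */
        css_put_many(&css, 8);          /* the used pages get uncharged later */

        drain_stock_model(&stock);      /* must drop the 24 refs the stock still pins */
        printf("refs left: %ld\n", css.refs);   /* back to the base reference: 1 */
        return 0;
}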
2280 | 2281 | ||
2281 | /* | 2282 | /* |
2282 | * This must be called with preemption disabled, or by a thread | 2283 | * This must be called with preemption disabled, or by a thread |
2283 | * that is pinned to the local cpu. | 2284 | * that is pinned to the local cpu. |
2284 | */ | 2285 | */ |
2285 | static void drain_local_stock(struct work_struct *dummy) | 2286 | static void drain_local_stock(struct work_struct *dummy) |
2286 | { | 2287 | { |
2287 | struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); | 2288 | struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); |
2288 | drain_stock(stock); | 2289 | drain_stock(stock); |
2289 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); | 2290 | clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); |
2290 | } | 2291 | } |
2291 | 2292 | ||
2292 | static void __init memcg_stock_init(void) | 2293 | static void __init memcg_stock_init(void) |
2293 | { | 2294 | { |
2294 | int cpu; | 2295 | int cpu; |
2295 | 2296 | ||
2296 | for_each_possible_cpu(cpu) { | 2297 | for_each_possible_cpu(cpu) { |
2297 | struct memcg_stock_pcp *stock = | 2298 | struct memcg_stock_pcp *stock = |
2298 | &per_cpu(memcg_stock, cpu); | 2299 | &per_cpu(memcg_stock, cpu); |
2299 | INIT_WORK(&stock->work, drain_local_stock); | 2300 | INIT_WORK(&stock->work, drain_local_stock); |
2300 | } | 2301 | } |
2301 | } | 2302 | } |
2302 | 2303 | ||
2303 | /* | 2304 | /* |
2304 | * Cache charges (nr_pages) in the local per-cpu area. | 2305 | * Cache charges (nr_pages) in the local per-cpu area. |
2305 | * This will be consumed by consume_stock() function, later. | 2306 | * This will be consumed by consume_stock() function, later. |
2306 | */ | 2307 | */ |
2307 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) | 2308 | static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2308 | { | 2309 | { |
2309 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | 2310 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); |
2310 | 2311 | ||
2311 | if (stock->cached != memcg) { /* reset if necessary */ | 2312 | if (stock->cached != memcg) { /* reset if necessary */ |
2312 | drain_stock(stock); | 2313 | drain_stock(stock); |
2313 | stock->cached = memcg; | 2314 | stock->cached = memcg; |
2314 | } | 2315 | } |
2315 | stock->nr_pages += nr_pages; | 2316 | stock->nr_pages += nr_pages; |
2316 | put_cpu_var(memcg_stock); | 2317 | put_cpu_var(memcg_stock); |
2317 | } | 2318 | } |
2318 | 2319 | ||
2319 | /* | 2320 | /* |
2320 | * Drains all per-CPU charge caches for the given root_memcg and the | 2321 | * Drains all per-CPU charge caches for the given root_memcg and the |
2321 | * hierarchy subtree under it. The sync flag says whether we should block | 2322 | * hierarchy subtree under it. The sync flag says whether we should block |
2322 | * until the work is done. | 2323 | * until the work is done. |
2323 | */ | 2324 | */ |
2324 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) | 2325 | static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) |
2325 | { | 2326 | { |
2326 | int cpu, curcpu; | 2327 | int cpu, curcpu; |
2327 | 2328 | ||
2328 | /* Notify other cpus that system-wide "drain" is running */ | 2329 | /* Notify other cpus that system-wide "drain" is running */ |
2329 | get_online_cpus(); | 2330 | get_online_cpus(); |
2330 | curcpu = get_cpu(); | 2331 | curcpu = get_cpu(); |
2331 | for_each_online_cpu(cpu) { | 2332 | for_each_online_cpu(cpu) { |
2332 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2333 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2333 | struct mem_cgroup *memcg; | 2334 | struct mem_cgroup *memcg; |
2334 | 2335 | ||
2335 | memcg = stock->cached; | 2336 | memcg = stock->cached; |
2336 | if (!memcg || !stock->nr_pages) | 2337 | if (!memcg || !stock->nr_pages) |
2337 | continue; | 2338 | continue; |
2338 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) | 2339 | if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) |
2339 | continue; | 2340 | continue; |
2340 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { | 2341 | if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { |
2341 | if (cpu == curcpu) | 2342 | if (cpu == curcpu) |
2342 | drain_local_stock(&stock->work); | 2343 | drain_local_stock(&stock->work); |
2343 | else | 2344 | else |
2344 | schedule_work_on(cpu, &stock->work); | 2345 | schedule_work_on(cpu, &stock->work); |
2345 | } | 2346 | } |
2346 | } | 2347 | } |
2347 | put_cpu(); | 2348 | put_cpu(); |
2348 | 2349 | ||
2349 | if (!sync) | 2350 | if (!sync) |
2350 | goto out; | 2351 | goto out; |
2351 | 2352 | ||
2352 | for_each_online_cpu(cpu) { | 2353 | for_each_online_cpu(cpu) { |
2353 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | 2354 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); |
2354 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) | 2355 | if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) |
2355 | flush_work(&stock->work); | 2356 | flush_work(&stock->work); |
2356 | } | 2357 | } |
2357 | out: | 2358 | out: |
2358 | put_online_cpus(); | 2359 | put_online_cpus(); |
2359 | } | 2360 | } |
2360 | 2361 | ||
2361 | /* | 2362 | /* |
2362 | * Tries to drain stocked charges on other cpus. This function is asynchronous | 2363 | * Tries to drain stocked charges on other cpus. This function is asynchronous |
2363 | * and just queues a drain work item per cpu to run locally on each cpu. Callers | 2364 | * and just queues a drain work item per cpu to run locally on each cpu. Callers |
2364 | * can expect some charges to come back later but cannot wait for that. | 2365 | * can expect some charges to come back later but cannot wait for that. |
2365 | */ | 2366 | */ |
2366 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) | 2367 | static void drain_all_stock_async(struct mem_cgroup *root_memcg) |
2367 | { | 2368 | { |
2368 | /* | 2369 | /* |
2369 | * If someone calls draining, avoid adding more kworker runs. | 2370 | * If someone calls draining, avoid adding more kworker runs. |
2370 | */ | 2371 | */ |
2371 | if (!mutex_trylock(&percpu_charge_mutex)) | 2372 | if (!mutex_trylock(&percpu_charge_mutex)) |
2372 | return; | 2373 | return; |
2373 | drain_all_stock(root_memcg, false); | 2374 | drain_all_stock(root_memcg, false); |
2374 | mutex_unlock(&percpu_charge_mutex); | 2375 | mutex_unlock(&percpu_charge_mutex); |
2375 | } | 2376 | } |
2376 | 2377 | ||
2377 | /* This is a synchronous drain interface. */ | 2378 | /* This is a synchronous drain interface. */ |
2378 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) | 2379 | static void drain_all_stock_sync(struct mem_cgroup *root_memcg) |
2379 | { | 2380 | { |
2380 | /* called when force_empty is called */ | 2381 | /* called when force_empty is called */ |
2381 | mutex_lock(&percpu_charge_mutex); | 2382 | mutex_lock(&percpu_charge_mutex); |
2382 | drain_all_stock(root_memcg, true); | 2383 | drain_all_stock(root_memcg, true); |
2383 | mutex_unlock(&percpu_charge_mutex); | 2384 | mutex_unlock(&percpu_charge_mutex); |
2384 | } | 2385 | } |
2385 | 2386 | ||
2386 | /* | 2387 | /* |
2387 | * This function drains the percpu counter values from a DEAD cpu and | 2388 | * This function drains the percpu counter values from a DEAD cpu and |
2388 | * moves them to the local cpu. Note that this function can be preempted. | 2389 | * moves them to the local cpu. Note that this function can be preempted. |
2389 | */ | 2390 | */ |
2390 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) | 2391 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) |
2391 | { | 2392 | { |
2392 | int i; | 2393 | int i; |
2393 | 2394 | ||
2394 | spin_lock(&memcg->pcp_counter_lock); | 2395 | spin_lock(&memcg->pcp_counter_lock); |
2395 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 2396 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
2396 | long x = per_cpu(memcg->stat->count[i], cpu); | 2397 | long x = per_cpu(memcg->stat->count[i], cpu); |
2397 | 2398 | ||
2398 | per_cpu(memcg->stat->count[i], cpu) = 0; | 2399 | per_cpu(memcg->stat->count[i], cpu) = 0; |
2399 | memcg->nocpu_base.count[i] += x; | 2400 | memcg->nocpu_base.count[i] += x; |
2400 | } | 2401 | } |
2401 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 2402 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { |
2402 | unsigned long x = per_cpu(memcg->stat->events[i], cpu); | 2403 | unsigned long x = per_cpu(memcg->stat->events[i], cpu); |
2403 | 2404 | ||
2404 | per_cpu(memcg->stat->events[i], cpu) = 0; | 2405 | per_cpu(memcg->stat->events[i], cpu) = 0; |
2405 | memcg->nocpu_base.events[i] += x; | 2406 | memcg->nocpu_base.events[i] += x; |
2406 | } | 2407 | } |
2407 | spin_unlock(&memcg->pcp_counter_lock); | 2408 | spin_unlock(&memcg->pcp_counter_lock); |
2408 | } | 2409 | } |
2409 | 2410 | ||
2410 | static int memcg_cpu_hotplug_callback(struct notifier_block *nb, | 2411 | static int memcg_cpu_hotplug_callback(struct notifier_block *nb, |
2411 | unsigned long action, | 2412 | unsigned long action, |
2412 | void *hcpu) | 2413 | void *hcpu) |
2413 | { | 2414 | { |
2414 | int cpu = (unsigned long)hcpu; | 2415 | int cpu = (unsigned long)hcpu; |
2415 | struct memcg_stock_pcp *stock; | 2416 | struct memcg_stock_pcp *stock; |
2416 | struct mem_cgroup *iter; | 2417 | struct mem_cgroup *iter; |
2417 | 2418 | ||
2418 | if (action == CPU_ONLINE) | 2419 | if (action == CPU_ONLINE) |
2419 | return NOTIFY_OK; | 2420 | return NOTIFY_OK; |
2420 | 2421 | ||
2421 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) | 2422 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) |
2422 | return NOTIFY_OK; | 2423 | return NOTIFY_OK; |
2423 | 2424 | ||
2424 | for_each_mem_cgroup(iter) | 2425 | for_each_mem_cgroup(iter) |
2425 | mem_cgroup_drain_pcp_counter(iter, cpu); | 2426 | mem_cgroup_drain_pcp_counter(iter, cpu); |
2426 | 2427 | ||
2427 | stock = &per_cpu(memcg_stock, cpu); | 2428 | stock = &per_cpu(memcg_stock, cpu); |
2428 | drain_stock(stock); | 2429 | drain_stock(stock); |
2429 | return NOTIFY_OK; | 2430 | return NOTIFY_OK; |
2430 | } | 2431 | } |
2431 | 2432 | ||
2432 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2433 | static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2433 | unsigned int nr_pages) | 2434 | unsigned int nr_pages) |
2434 | { | 2435 | { |
2435 | unsigned int batch = max(CHARGE_BATCH, nr_pages); | 2436 | unsigned int batch = max(CHARGE_BATCH, nr_pages); |
2436 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2437 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2437 | struct mem_cgroup *mem_over_limit; | 2438 | struct mem_cgroup *mem_over_limit; |
2438 | struct page_counter *counter; | 2439 | struct page_counter *counter; |
2439 | unsigned long nr_reclaimed; | 2440 | unsigned long nr_reclaimed; |
2440 | bool may_swap = true; | 2441 | bool may_swap = true; |
2441 | bool drained = false; | 2442 | bool drained = false; |
2442 | int ret = 0; | 2443 | int ret = 0; |
2443 | 2444 | ||
2444 | if (mem_cgroup_is_root(memcg)) | 2445 | if (mem_cgroup_is_root(memcg)) |
2445 | goto done; | 2446 | goto done; |
2446 | retry: | 2447 | retry: |
2447 | if (consume_stock(memcg, nr_pages)) | 2448 | if (consume_stock(memcg, nr_pages)) |
2448 | goto done; | 2449 | goto done; |
2449 | 2450 | ||
2450 | if (!do_swap_account || | 2451 | if (!do_swap_account || |
2451 | !page_counter_try_charge(&memcg->memsw, batch, &counter)) { | 2452 | !page_counter_try_charge(&memcg->memsw, batch, &counter)) { |
2452 | if (!page_counter_try_charge(&memcg->memory, batch, &counter)) | 2453 | if (!page_counter_try_charge(&memcg->memory, batch, &counter)) |
2453 | goto done_restock; | 2454 | goto done_restock; |
2454 | if (do_swap_account) | 2455 | if (do_swap_account) |
2455 | page_counter_uncharge(&memcg->memsw, batch); | 2456 | page_counter_uncharge(&memcg->memsw, batch); |
2456 | mem_over_limit = mem_cgroup_from_counter(counter, memory); | 2457 | mem_over_limit = mem_cgroup_from_counter(counter, memory); |
2457 | } else { | 2458 | } else { |
2458 | mem_over_limit = mem_cgroup_from_counter(counter, memsw); | 2459 | mem_over_limit = mem_cgroup_from_counter(counter, memsw); |
2459 | may_swap = false; | 2460 | may_swap = false; |
2460 | } | 2461 | } |
2461 | 2462 | ||
2462 | if (batch > nr_pages) { | 2463 | if (batch > nr_pages) { |
2463 | batch = nr_pages; | 2464 | batch = nr_pages; |
2464 | goto retry; | 2465 | goto retry; |
2465 | } | 2466 | } |
2466 | 2467 | ||
2467 | /* | 2468 | /* |
2468 | * Unlike in global OOM situations, memcg is not in a physical | 2469 | * Unlike in global OOM situations, memcg is not in a physical |
2469 | * memory shortage. Allow dying and OOM-killed tasks to | 2470 | * memory shortage. Allow dying and OOM-killed tasks to |
2470 | * bypass the last charges so that they can exit quickly and | 2471 | * bypass the last charges so that they can exit quickly and |
2471 | * free their memory. | 2472 | * free their memory. |
2472 | */ | 2473 | */ |
2473 | if (unlikely(test_thread_flag(TIF_MEMDIE) || | 2474 | if (unlikely(test_thread_flag(TIF_MEMDIE) || |
2474 | fatal_signal_pending(current) || | 2475 | fatal_signal_pending(current) || |
2475 | current->flags & PF_EXITING)) | 2476 | current->flags & PF_EXITING)) |
2476 | goto bypass; | 2477 | goto bypass; |
2477 | 2478 | ||
2478 | if (unlikely(task_in_memcg_oom(current))) | 2479 | if (unlikely(task_in_memcg_oom(current))) |
2479 | goto nomem; | 2480 | goto nomem; |
2480 | 2481 | ||
2481 | if (!(gfp_mask & __GFP_WAIT)) | 2482 | if (!(gfp_mask & __GFP_WAIT)) |
2482 | goto nomem; | 2483 | goto nomem; |
2483 | 2484 | ||
2484 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, | 2485 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
2485 | gfp_mask, may_swap); | 2486 | gfp_mask, may_swap); |
2486 | 2487 | ||
2487 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2488 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2488 | goto retry; | 2489 | goto retry; |
2489 | 2490 | ||
2490 | if (!drained) { | 2491 | if (!drained) { |
2491 | drain_all_stock_async(mem_over_limit); | 2492 | drain_all_stock_async(mem_over_limit); |
2492 | drained = true; | 2493 | drained = true; |
2493 | goto retry; | 2494 | goto retry; |
2494 | } | 2495 | } |
2495 | 2496 | ||
2496 | if (gfp_mask & __GFP_NORETRY) | 2497 | if (gfp_mask & __GFP_NORETRY) |
2497 | goto nomem; | 2498 | goto nomem; |
2498 | /* | 2499 | /* |
2499 | * Even though the limit is exceeded at this point, reclaim | 2500 | * Even though the limit is exceeded at this point, reclaim |
2500 | * may have been able to free some pages. Retry the charge | 2501 | * may have been able to free some pages. Retry the charge |
2501 | * before killing the task. | 2502 | * before killing the task. |
2502 | * | 2503 | * |
2503 | * Only for regular pages, though: huge pages are rather | 2504 | * Only for regular pages, though: huge pages are rather |
2504 | * unlikely to succeed so close to the limit, and we fall back | 2505 | * unlikely to succeed so close to the limit, and we fall back |
2505 | * to regular pages anyway in case of failure. | 2506 | * to regular pages anyway in case of failure. |
2506 | */ | 2507 | */ |
2507 | if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) | 2508 | if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) |
2508 | goto retry; | 2509 | goto retry; |
2509 | /* | 2510 | /* |
2510 | * During task move, charges can be double counted. So it's | 2511 | * During task move, charges can be double counted. So it's |
2511 | * better to wait until the end of task_move if one is in progress. | 2512 | * better to wait until the end of task_move if one is in progress. |
2512 | */ | 2513 | */ |
2513 | if (mem_cgroup_wait_acct_move(mem_over_limit)) | 2514 | if (mem_cgroup_wait_acct_move(mem_over_limit)) |
2514 | goto retry; | 2515 | goto retry; |
2515 | 2516 | ||
2516 | if (nr_retries--) | 2517 | if (nr_retries--) |
2517 | goto retry; | 2518 | goto retry; |
2518 | 2519 | ||
2519 | if (gfp_mask & __GFP_NOFAIL) | 2520 | if (gfp_mask & __GFP_NOFAIL) |
2520 | goto bypass; | 2521 | goto bypass; |
2521 | 2522 | ||
2522 | if (fatal_signal_pending(current)) | 2523 | if (fatal_signal_pending(current)) |
2523 | goto bypass; | 2524 | goto bypass; |
2524 | 2525 | ||
2525 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); | 2526 | mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); |
2526 | nomem: | 2527 | nomem: |
2527 | if (!(gfp_mask & __GFP_NOFAIL)) | 2528 | if (!(gfp_mask & __GFP_NOFAIL)) |
2528 | return -ENOMEM; | 2529 | return -ENOMEM; |
2529 | bypass: | 2530 | bypass: |
2530 | return -EINTR; | 2531 | return -EINTR; |
2531 | 2532 | ||
2532 | done_restock: | 2533 | done_restock: |
2534 | css_get_many(&memcg->css, batch); | ||
2533 | if (batch > nr_pages) | 2535 | if (batch > nr_pages) |
2534 | refill_stock(memcg, batch - nr_pages); | 2536 | refill_stock(memcg, batch - nr_pages); |
2535 | done: | 2537 | done: |
2536 | return ret; | 2538 | return ret; |
2537 | } | 2539 | } |
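On its success path try_charge() rounds the request up to CHARGE_BATCH, charges the page counters and takes css references for that whole batch (the css_get_many() added at done_restock above), and parks the surplus in the per-cpu stock for later consume_stock() hits. A small sketch of just that rounding, with the hypothetical helper batch_for():

#include <stdio.h>

#define CHARGE_BATCH 32U

/* How many pages a single charge attempt will actually pull from the
 * page counters (and how many css references it will take). */
static unsigned int batch_for(unsigned int nr_pages)
{
        return nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;
}

int main(void)
{
        unsigned int requests[] = { 1, 8, 32, 512 };

        for (unsigned int i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
                unsigned int batch = batch_for(requests[i]);

                printf("request %3u -> charge %3u, stock keeps %3u\n",
                       requests[i], batch, batch - requests[i]);
        }
        return 0;
}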
2538 | 2540 | ||
2539 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | 2541 | static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) |
2540 | { | 2542 | { |
2541 | if (mem_cgroup_is_root(memcg)) | 2543 | if (mem_cgroup_is_root(memcg)) |
2542 | return; | 2544 | return; |
2543 | 2545 | ||
2544 | page_counter_uncharge(&memcg->memory, nr_pages); | 2546 | page_counter_uncharge(&memcg->memory, nr_pages); |
2545 | if (do_swap_account) | 2547 | if (do_swap_account) |
2546 | page_counter_uncharge(&memcg->memsw, nr_pages); | 2548 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2549 | |||
2550 | css_put_many(&memcg->css, nr_pages); | ||
2547 | } | 2551 | } |
2548 | 2552 | ||
2549 | /* | 2553 | /* |
2550 | * A helper function to get a mem_cgroup from an ID. Must be called under | 2554 | * A helper function to get a mem_cgroup from an ID. Must be called under |
2551 | * rcu_read_lock(). The caller is responsible for calling | 2555 | * rcu_read_lock(). The caller is responsible for calling |
2552 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | 2556 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping |
2553 | * refcnt from swap can be called against removed memcg.) | 2557 | * refcnt from swap can be called against removed memcg.) |
2554 | */ | 2558 | */ |
2555 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | 2559 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) |
2556 | { | 2560 | { |
2557 | /* ID 0 is unused ID */ | 2561 | /* ID 0 is unused ID */ |
2558 | if (!id) | 2562 | if (!id) |
2559 | return NULL; | 2563 | return NULL; |
2560 | return mem_cgroup_from_id(id); | 2564 | return mem_cgroup_from_id(id); |
2561 | } | 2565 | } |
2562 | 2566 | ||
2563 | /* | 2567 | /* |
2564 | * try_get_mem_cgroup_from_page - look up page's memcg association | 2568 | * try_get_mem_cgroup_from_page - look up page's memcg association |
2565 | * @page: the page | 2569 | * @page: the page |
2566 | * | 2570 | * |
2567 | * Look up, get a css reference, and return the memcg that owns @page. | 2571 | * Look up, get a css reference, and return the memcg that owns @page. |
2568 | * | 2572 | * |
2569 | * The page must be locked to prevent racing with swap-in and page | 2573 | * The page must be locked to prevent racing with swap-in and page |
2570 | * cache charges. If coming from an unlocked page table, the caller | 2574 | * cache charges. If coming from an unlocked page table, the caller |
2571 | * must ensure the page is on the LRU or this can race with charging. | 2575 | * must ensure the page is on the LRU or this can race with charging. |
2572 | */ | 2576 | */ |
2573 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2577 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
2574 | { | 2578 | { |
2575 | struct mem_cgroup *memcg = NULL; | 2579 | struct mem_cgroup *memcg = NULL; |
2576 | struct page_cgroup *pc; | 2580 | struct page_cgroup *pc; |
2577 | unsigned short id; | 2581 | unsigned short id; |
2578 | swp_entry_t ent; | 2582 | swp_entry_t ent; |
2579 | 2583 | ||
2580 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 2584 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
2581 | 2585 | ||
2582 | pc = lookup_page_cgroup(page); | 2586 | pc = lookup_page_cgroup(page); |
2583 | if (PageCgroupUsed(pc)) { | 2587 | if (PageCgroupUsed(pc)) { |
2584 | memcg = pc->mem_cgroup; | 2588 | memcg = pc->mem_cgroup; |
2585 | if (memcg && !css_tryget_online(&memcg->css)) | 2589 | if (memcg && !css_tryget_online(&memcg->css)) |
2586 | memcg = NULL; | 2590 | memcg = NULL; |
2587 | } else if (PageSwapCache(page)) { | 2591 | } else if (PageSwapCache(page)) { |
2588 | ent.val = page_private(page); | 2592 | ent.val = page_private(page); |
2589 | id = lookup_swap_cgroup_id(ent); | 2593 | id = lookup_swap_cgroup_id(ent); |
2590 | rcu_read_lock(); | 2594 | rcu_read_lock(); |
2591 | memcg = mem_cgroup_lookup(id); | 2595 | memcg = mem_cgroup_lookup(id); |
2592 | if (memcg && !css_tryget_online(&memcg->css)) | 2596 | if (memcg && !css_tryget_online(&memcg->css)) |
2593 | memcg = NULL; | 2597 | memcg = NULL; |
2594 | rcu_read_unlock(); | 2598 | rcu_read_unlock(); |
2595 | } | 2599 | } |
2596 | return memcg; | 2600 | return memcg; |
2597 | } | 2601 | } |
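try_get_mem_cgroup_from_page() follows the common lookup-then-tryget pattern: find the candidate memcg, and only hand it out if a reference could still be taken; otherwise the caller sees NULL, exactly as if there were no association. A tiny model of tryget semantics using C11 atomics (struct obj, obj_tryget() and the zero-means-dead convention are illustrative stand-ins, not the kernel's css API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj { atomic_long refs; };       /* 0 means the object is already dead */

static bool obj_tryget(struct obj *o)
{
        long old = atomic_load(&o->refs);

        while (old > 0) {
                /* bump the count only if the object is still alive */
                if (atomic_compare_exchange_weak(&o->refs, &old, old + 1))
                        return true;    /* got a reference */
        }
        return false;                   /* too late, the caller must treat this as NULL */
}

int main(void)
{
        struct obj live = { .refs = 1 };
        struct obj dead = { .refs = 0 };

        printf("live: %d\n", obj_tryget(&live));        /* 1 */
        printf("dead: %d\n", obj_tryget(&dead));        /* 0 */
        return 0;
}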
2598 | 2602 | ||
2599 | static void lock_page_lru(struct page *page, int *isolated) | 2603 | static void lock_page_lru(struct page *page, int *isolated) |
2600 | { | 2604 | { |
2601 | struct zone *zone = page_zone(page); | 2605 | struct zone *zone = page_zone(page); |
2602 | 2606 | ||
2603 | spin_lock_irq(&zone->lru_lock); | 2607 | spin_lock_irq(&zone->lru_lock); |
2604 | if (PageLRU(page)) { | 2608 | if (PageLRU(page)) { |
2605 | struct lruvec *lruvec; | 2609 | struct lruvec *lruvec; |
2606 | 2610 | ||
2607 | lruvec = mem_cgroup_page_lruvec(page, zone); | 2611 | lruvec = mem_cgroup_page_lruvec(page, zone); |
2608 | ClearPageLRU(page); | 2612 | ClearPageLRU(page); |
2609 | del_page_from_lru_list(page, lruvec, page_lru(page)); | 2613 | del_page_from_lru_list(page, lruvec, page_lru(page)); |
2610 | *isolated = 1; | 2614 | *isolated = 1; |
2611 | } else | 2615 | } else |
2612 | *isolated = 0; | 2616 | *isolated = 0; |
2613 | } | 2617 | } |
2614 | 2618 | ||
2615 | static void unlock_page_lru(struct page *page, int isolated) | 2619 | static void unlock_page_lru(struct page *page, int isolated) |
2616 | { | 2620 | { |
2617 | struct zone *zone = page_zone(page); | 2621 | struct zone *zone = page_zone(page); |
2618 | 2622 | ||
2619 | if (isolated) { | 2623 | if (isolated) { |
2620 | struct lruvec *lruvec; | 2624 | struct lruvec *lruvec; |
2621 | 2625 | ||
2622 | lruvec = mem_cgroup_page_lruvec(page, zone); | 2626 | lruvec = mem_cgroup_page_lruvec(page, zone); |
2623 | VM_BUG_ON_PAGE(PageLRU(page), page); | 2627 | VM_BUG_ON_PAGE(PageLRU(page), page); |
2624 | SetPageLRU(page); | 2628 | SetPageLRU(page); |
2625 | add_page_to_lru_list(page, lruvec, page_lru(page)); | 2629 | add_page_to_lru_list(page, lruvec, page_lru(page)); |
2626 | } | 2630 | } |
2627 | spin_unlock_irq(&zone->lru_lock); | 2631 | spin_unlock_irq(&zone->lru_lock); |
2628 | } | 2632 | } |
2629 | 2633 | ||
2630 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, | 2634 | static void commit_charge(struct page *page, struct mem_cgroup *memcg, |
2631 | bool lrucare) | 2635 | bool lrucare) |
2632 | { | 2636 | { |
2633 | struct page_cgroup *pc = lookup_page_cgroup(page); | 2637 | struct page_cgroup *pc = lookup_page_cgroup(page); |
2634 | int isolated; | 2638 | int isolated; |
2635 | 2639 | ||
2636 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); | 2640 | VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); |
2637 | /* | 2641 | /* |
2638 | * we don't need page_cgroup_lock for tail pages, because they are not | 2642 | * we don't need page_cgroup_lock for tail pages, because they are not |
2639 | * accessed by any other context at this point. | 2643 | * accessed by any other context at this point. |
2640 | */ | 2644 | */ |
2641 | 2645 | ||
2642 | /* | 2646 | /* |
2643 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page | 2647 | * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page |
2644 | * may already be on some other mem_cgroup's LRU. Take care of it. | 2648 | * may already be on some other mem_cgroup's LRU. Take care of it. |
2645 | */ | 2649 | */ |
2646 | if (lrucare) | 2650 | if (lrucare) |
2647 | lock_page_lru(page, &isolated); | 2651 | lock_page_lru(page, &isolated); |
2648 | 2652 | ||
2649 | /* | 2653 | /* |
2650 | * Nobody should be changing or seriously looking at | 2654 | * Nobody should be changing or seriously looking at |
2651 | * pc->mem_cgroup and pc->flags at this point: | 2655 | * pc->mem_cgroup and pc->flags at this point: |
2652 | * | 2656 | * |
2653 | * - the page is uncharged | 2657 | * - the page is uncharged |
2654 | * | 2658 | * |
2655 | * - the page is off-LRU | 2659 | * - the page is off-LRU |
2656 | * | 2660 | * |
2657 | * - an anonymous fault has exclusive page access, except for | 2661 | * - an anonymous fault has exclusive page access, except for |
2658 | * a locked page table | 2662 | * a locked page table |
2659 | * | 2663 | * |
2660 | * - a page cache insertion, a swapin fault, or a migration | 2664 | * - a page cache insertion, a swapin fault, or a migration |
2661 | * have the page locked | 2665 | * have the page locked |
2662 | */ | 2666 | */ |
2663 | pc->mem_cgroup = memcg; | 2667 | pc->mem_cgroup = memcg; |
2664 | pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); | 2668 | pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); |
2665 | 2669 | ||
2666 | if (lrucare) | 2670 | if (lrucare) |
2667 | unlock_page_lru(page, isolated); | 2671 | unlock_page_lru(page, isolated); |
2668 | } | 2672 | } |
2669 | 2673 | ||
2670 | #ifdef CONFIG_MEMCG_KMEM | 2674 | #ifdef CONFIG_MEMCG_KMEM |
2671 | /* | 2675 | /* |
2672 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or | 2676 | * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or |
2673 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. | 2677 | * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. |
2674 | */ | 2678 | */ |
2675 | static DEFINE_MUTEX(memcg_slab_mutex); | 2679 | static DEFINE_MUTEX(memcg_slab_mutex); |
2676 | 2680 | ||
2677 | static DEFINE_MUTEX(activate_kmem_mutex); | 2681 | static DEFINE_MUTEX(activate_kmem_mutex); |
2678 | 2682 | ||
2679 | /* | 2683 | /* |
2680 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | 2684 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer |
2681 | * in the memcg_cache_params struct. | 2685 | * in the memcg_cache_params struct. |
2682 | */ | 2686 | */ |
2683 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | 2687 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) |
2684 | { | 2688 | { |
2685 | struct kmem_cache *cachep; | 2689 | struct kmem_cache *cachep; |
2686 | 2690 | ||
2687 | VM_BUG_ON(p->is_root_cache); | 2691 | VM_BUG_ON(p->is_root_cache); |
2688 | cachep = p->root_cache; | 2692 | cachep = p->root_cache; |
2689 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); | 2693 | return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); |
2690 | } | 2694 | } |
2691 | 2695 | ||
2692 | #ifdef CONFIG_SLABINFO | 2696 | #ifdef CONFIG_SLABINFO |
2693 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | 2697 | static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) |
2694 | { | 2698 | { |
2695 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 2699 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
2696 | struct memcg_cache_params *params; | 2700 | struct memcg_cache_params *params; |
2697 | 2701 | ||
2698 | if (!memcg_kmem_is_active(memcg)) | 2702 | if (!memcg_kmem_is_active(memcg)) |
2699 | return -EIO; | 2703 | return -EIO; |
2700 | 2704 | ||
2701 | print_slabinfo_header(m); | 2705 | print_slabinfo_header(m); |
2702 | 2706 | ||
2703 | mutex_lock(&memcg_slab_mutex); | 2707 | mutex_lock(&memcg_slab_mutex); |
2704 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | 2708 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) |
2705 | cache_show(memcg_params_to_cache(params), m); | 2709 | cache_show(memcg_params_to_cache(params), m); |
2706 | mutex_unlock(&memcg_slab_mutex); | 2710 | mutex_unlock(&memcg_slab_mutex); |
2707 | 2711 | ||
2708 | return 0; | 2712 | return 0; |
2709 | } | 2713 | } |
2710 | #endif | 2714 | #endif |
2711 | 2715 | ||
2712 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, | 2716 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, |
2713 | unsigned long nr_pages) | 2717 | unsigned long nr_pages) |
2714 | { | 2718 | { |
2715 | struct page_counter *counter; | 2719 | struct page_counter *counter; |
2716 | int ret = 0; | 2720 | int ret = 0; |
2717 | 2721 | ||
2718 | ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); | 2722 | ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter); |
2719 | if (ret < 0) | 2723 | if (ret < 0) |
2720 | return ret; | 2724 | return ret; |
2721 | 2725 | ||
2722 | ret = try_charge(memcg, gfp, nr_pages); | 2726 | ret = try_charge(memcg, gfp, nr_pages); |
2723 | if (ret == -EINTR) { | 2727 | if (ret == -EINTR) { |
2724 | /* | 2728 | /* |
2725 | * try_charge() chose to bypass to root due to OOM kill or | 2729 | * try_charge() chose to bypass to root due to OOM kill or |
2726 | * fatal signal. Since our only options are to either fail | 2730 | * fatal signal. Since our only options are to either fail |
2727 | * the allocation or charge it to this cgroup, do it as a | 2731 | * the allocation or charge it to this cgroup, do it as a |
2728 | * temporary condition. But we can't fail. From a kmem/slab | 2732 | * temporary condition. But we can't fail. From a kmem/slab |
2729 | * perspective, the cache has already been selected, by | 2733 | * perspective, the cache has already been selected, by |
2730 | * mem_cgroup_kmem_get_cache(), so it is too late to change | 2734 | * mem_cgroup_kmem_get_cache(), so it is too late to change |
2731 | * our minds. | 2735 | * our minds. |
2732 | * | 2736 | * |
2733 | * This condition will only trigger if the task entered | 2737 | * This condition will only trigger if the task entered |
2734 | * memcg_charge_kmem in a sane state, but was OOM-killed | 2738 | * memcg_charge_kmem in a sane state, but was OOM-killed |
2735 | * during try_charge() above. Tasks that were already dying | 2739 | * during try_charge() above. Tasks that were already dying |
2736 | * when the allocation triggers should have been already | 2740 | * when the allocation triggers should have been already |
2737 | * directed to the root cgroup in memcontrol.h | 2741 | * directed to the root cgroup in memcontrol.h |
2738 | */ | 2742 | */ |
2739 | page_counter_charge(&memcg->memory, nr_pages); | 2743 | page_counter_charge(&memcg->memory, nr_pages); |
2740 | if (do_swap_account) | 2744 | if (do_swap_account) |
2741 | page_counter_charge(&memcg->memsw, nr_pages); | 2745 | page_counter_charge(&memcg->memsw, nr_pages); |
2746 | css_get_many(&memcg->css, nr_pages); | ||
2742 | ret = 0; | 2747 | ret = 0; |
2743 | } else if (ret) | 2748 | } else if (ret) |
2744 | page_counter_uncharge(&memcg->kmem, nr_pages); | 2749 | page_counter_uncharge(&memcg->kmem, nr_pages); |
2745 | 2750 | ||
2746 | return ret; | 2751 | return ret; |
2747 | } | 2752 | } |
2748 | 2753 | ||
2749 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, | 2754 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, |
2750 | unsigned long nr_pages) | 2755 | unsigned long nr_pages) |
2751 | { | 2756 | { |
2752 | page_counter_uncharge(&memcg->memory, nr_pages); | 2757 | page_counter_uncharge(&memcg->memory, nr_pages); |
2753 | if (do_swap_account) | 2758 | if (do_swap_account) |
2754 | page_counter_uncharge(&memcg->memsw, nr_pages); | 2759 | page_counter_uncharge(&memcg->memsw, nr_pages); |
2755 | 2760 | ||
2756 | /* Not down to 0 */ | 2761 | /* Not down to 0 */ |
2757 | if (page_counter_uncharge(&memcg->kmem, nr_pages)) | 2762 | if (page_counter_uncharge(&memcg->kmem, nr_pages)) { |
2763 | css_put_many(&memcg->css, nr_pages); | ||
2758 | return; | 2764 | return; |
2765 | } | ||
2759 | 2766 | ||
2760 | /* | 2767 | /* |
2761 | * Releases a reference taken in kmem_cgroup_css_offline in case | 2768 | * Releases a reference taken in kmem_cgroup_css_offline in case |
2762 | * this last uncharge is racing with the offlining code or it is | 2769 | * this last uncharge is racing with the offlining code or it is |
2763 | * outliving the memcg existence. | 2770 | * outliving the memcg existence. |
2764 | * | 2771 | * |
2765 | * The memory barrier imposed by test&clear is paired with the | 2772 | * The memory barrier imposed by test&clear is paired with the |
2766 | * explicit one in memcg_kmem_mark_dead(). | 2773 | * explicit one in memcg_kmem_mark_dead(). |
2767 | */ | 2774 | */ |
2768 | if (memcg_kmem_test_and_clear_dead(memcg)) | 2775 | if (memcg_kmem_test_and_clear_dead(memcg)) |
2769 | css_put(&memcg->css); | 2776 | css_put(&memcg->css); |
2777 | |||
2778 | css_put_many(&memcg->css, nr_pages); | ||
2770 | } | 2779 | } |
2771 | 2780 | ||
2772 | /* | 2781 | /* |
2773 | * helper for accessing a memcg's index. It will be used as an index in the | 2782 | * helper for accessing a memcg's index. It will be used as an index in the |
2774 | * child cache array in kmem_cache, and also to derive its name. This function | 2783 | * child cache array in kmem_cache, and also to derive its name. This function |
2775 | * will return -1 when this is not a kmem-limited memcg. | 2784 | * will return -1 when this is not a kmem-limited memcg. |
2776 | */ | 2785 | */ |
2777 | int memcg_cache_id(struct mem_cgroup *memcg) | 2786 | int memcg_cache_id(struct mem_cgroup *memcg) |
2778 | { | 2787 | { |
2779 | return memcg ? memcg->kmemcg_id : -1; | 2788 | return memcg ? memcg->kmemcg_id : -1; |
2780 | } | 2789 | } |
2781 | 2790 | ||
2782 | static int memcg_alloc_cache_id(void) | 2791 | static int memcg_alloc_cache_id(void) |
2783 | { | 2792 | { |
2784 | int id, size; | 2793 | int id, size; |
2785 | int err; | 2794 | int err; |
2786 | 2795 | ||
2787 | id = ida_simple_get(&kmem_limited_groups, | 2796 | id = ida_simple_get(&kmem_limited_groups, |
2788 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | 2797 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); |
2789 | if (id < 0) | 2798 | if (id < 0) |
2790 | return id; | 2799 | return id; |
2791 | 2800 | ||
2792 | if (id < memcg_limited_groups_array_size) | 2801 | if (id < memcg_limited_groups_array_size) |
2793 | return id; | 2802 | return id; |
2794 | 2803 | ||
2795 | /* | 2804 | /* |
2796 | * There's no space for the new id in memcg_caches arrays, | 2805 | * There's no space for the new id in memcg_caches arrays, |
2797 | * so we have to grow them. | 2806 | * so we have to grow them. |
2798 | */ | 2807 | */ |
2799 | 2808 | ||
2800 | size = 2 * (id + 1); | 2809 | size = 2 * (id + 1); |
2801 | if (size < MEMCG_CACHES_MIN_SIZE) | 2810 | if (size < MEMCG_CACHES_MIN_SIZE) |
2802 | size = MEMCG_CACHES_MIN_SIZE; | 2811 | size = MEMCG_CACHES_MIN_SIZE; |
2803 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2812 | else if (size > MEMCG_CACHES_MAX_SIZE) |
2804 | size = MEMCG_CACHES_MAX_SIZE; | 2813 | size = MEMCG_CACHES_MAX_SIZE; |
2805 | 2814 | ||
2806 | mutex_lock(&memcg_slab_mutex); | 2815 | mutex_lock(&memcg_slab_mutex); |
2807 | err = memcg_update_all_caches(size); | 2816 | err = memcg_update_all_caches(size); |
2808 | mutex_unlock(&memcg_slab_mutex); | 2817 | mutex_unlock(&memcg_slab_mutex); |
2809 | 2818 | ||
2810 | if (err) { | 2819 | if (err) { |
2811 | ida_simple_remove(&kmem_limited_groups, id); | 2820 | ida_simple_remove(&kmem_limited_groups, id); |
2812 | return err; | 2821 | return err; |
2813 | } | 2822 | } |
2814 | return id; | 2823 | return id; |
2815 | } | 2824 | } |
2816 | 2825 | ||
2817 | static void memcg_free_cache_id(int id) | 2826 | static void memcg_free_cache_id(int id) |
2818 | { | 2827 | { |
2819 | ida_simple_remove(&kmem_limited_groups, id); | 2828 | ida_simple_remove(&kmem_limited_groups, id); |
2820 | } | 2829 | } |
2821 | 2830 | ||
2822 | /* | 2831 | /* |
2823 | * We should update the current array size iff all cache updates succeed. This | 2832 | * We should update the current array size iff all cache updates succeed. This |
2824 | * can only be done from the slab side. The slab mutex needs to be held when | 2833 | * can only be done from the slab side. The slab mutex needs to be held when |
2825 | * calling this. | 2834 | * calling this. |
2826 | */ | 2835 | */ |
2827 | void memcg_update_array_size(int num) | 2836 | void memcg_update_array_size(int num) |
2828 | { | 2837 | { |
2829 | memcg_limited_groups_array_size = num; | 2838 | memcg_limited_groups_array_size = num; |
2830 | } | 2839 | } |
2831 | 2840 | ||
2832 | static void memcg_register_cache(struct mem_cgroup *memcg, | 2841 | static void memcg_register_cache(struct mem_cgroup *memcg, |
2833 | struct kmem_cache *root_cache) | 2842 | struct kmem_cache *root_cache) |
2834 | { | 2843 | { |
2835 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by | 2844 | static char memcg_name_buf[NAME_MAX + 1]; /* protected by |
2836 | memcg_slab_mutex */ | 2845 | memcg_slab_mutex */ |
2837 | struct kmem_cache *cachep; | 2846 | struct kmem_cache *cachep; |
2838 | int id; | 2847 | int id; |
2839 | 2848 | ||
2840 | lockdep_assert_held(&memcg_slab_mutex); | 2849 | lockdep_assert_held(&memcg_slab_mutex); |
2841 | 2850 | ||
2842 | id = memcg_cache_id(memcg); | 2851 | id = memcg_cache_id(memcg); |
2843 | 2852 | ||
2844 | /* | 2853 | /* |
2845 | * Since per-memcg caches are created asynchronously on first | 2854 | * Since per-memcg caches are created asynchronously on first |
2846 | * allocation (see memcg_kmem_get_cache()), several threads can try to | 2855 | * allocation (see memcg_kmem_get_cache()), several threads can try to |
2847 | * create the same cache, but only one of them may succeed. | 2856 | * create the same cache, but only one of them may succeed. |
2848 | */ | 2857 | */ |
2849 | if (cache_from_memcg_idx(root_cache, id)) | 2858 | if (cache_from_memcg_idx(root_cache, id)) |
2850 | return; | 2859 | return; |
2851 | 2860 | ||
2852 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); | 2861 | cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); |
2853 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); | 2862 | cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); |
2854 | /* | 2863 | /* |
2855 | * If we could not create a memcg cache, do not complain, because | 2864 | * If we could not create a memcg cache, do not complain, because |
2856 | * that's not critical at all as we can always proceed with the root | 2865 | * that's not critical at all as we can always proceed with the root |
2857 | * cache. | 2866 | * cache. |
2858 | */ | 2867 | */ |
2859 | if (!cachep) | 2868 | if (!cachep) |
2860 | return; | 2869 | return; |
2861 | 2870 | ||
2862 | css_get(&memcg->css); | 2871 | css_get(&memcg->css); |
2863 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | 2872 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
2864 | 2873 | ||
2865 | /* | 2874 | /* |
2866 | * Since readers won't lock (see cache_from_memcg_idx()), we need a | 2875 | * Since readers won't lock (see cache_from_memcg_idx()), we need a |
2867 | * barrier here to ensure nobody will see the kmem_cache partially | 2876 | * barrier here to ensure nobody will see the kmem_cache partially |
2868 | * initialized. | 2877 | * initialized. |
2869 | */ | 2878 | */ |
2870 | smp_wmb(); | 2879 | smp_wmb(); |
2871 | 2880 | ||
2872 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); | 2881 | BUG_ON(root_cache->memcg_params->memcg_caches[id]); |
2873 | root_cache->memcg_params->memcg_caches[id] = cachep; | 2882 | root_cache->memcg_params->memcg_caches[id] = cachep; |
2874 | } | 2883 | } |
2875 | 2884 | ||
2876 | static void memcg_unregister_cache(struct kmem_cache *cachep) | 2885 | static void memcg_unregister_cache(struct kmem_cache *cachep) |
2877 | { | 2886 | { |
2878 | struct kmem_cache *root_cache; | 2887 | struct kmem_cache *root_cache; |
2879 | struct mem_cgroup *memcg; | 2888 | struct mem_cgroup *memcg; |
2880 | int id; | 2889 | int id; |
2881 | 2890 | ||
2882 | lockdep_assert_held(&memcg_slab_mutex); | 2891 | lockdep_assert_held(&memcg_slab_mutex); |
2883 | 2892 | ||
2884 | BUG_ON(is_root_cache(cachep)); | 2893 | BUG_ON(is_root_cache(cachep)); |
2885 | 2894 | ||
2886 | root_cache = cachep->memcg_params->root_cache; | 2895 | root_cache = cachep->memcg_params->root_cache; |
2887 | memcg = cachep->memcg_params->memcg; | 2896 | memcg = cachep->memcg_params->memcg; |
2888 | id = memcg_cache_id(memcg); | 2897 | id = memcg_cache_id(memcg); |
2889 | 2898 | ||
2890 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); | 2899 | BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); |
2891 | root_cache->memcg_params->memcg_caches[id] = NULL; | 2900 | root_cache->memcg_params->memcg_caches[id] = NULL; |
2892 | 2901 | ||
2893 | list_del(&cachep->memcg_params->list); | 2902 | list_del(&cachep->memcg_params->list); |
2894 | 2903 | ||
2895 | kmem_cache_destroy(cachep); | 2904 | kmem_cache_destroy(cachep); |
2896 | 2905 | ||
2897 | /* drop the reference taken in memcg_register_cache */ | 2906 | /* drop the reference taken in memcg_register_cache */ |
2898 | css_put(&memcg->css); | 2907 | css_put(&memcg->css); |
2899 | } | 2908 | } |
2900 | 2909 | ||
2901 | /* | 2910 | /* |
2902 | * During the creation of a new cache, we need to disable our accounting mechanism | 2911 | * During the creation of a new cache, we need to disable our accounting mechanism |
2903 | * altogether. This is true even if we are not creating, but rather just | 2912 | * altogether. This is true even if we are not creating, but rather just |
2904 | * enqueueing new caches to be created. | 2913 | * enqueueing new caches to be created. |
2905 | * | 2914 | * |
2906 | * This is because that process will trigger allocations; some visible, like | 2915 | * This is because that process will trigger allocations; some visible, like |
2907 | * explicit kmallocs to auxiliary data structures, name strings and internal | 2916 | * explicit kmallocs to auxiliary data structures, name strings and internal |
2908 | * cache structures; some well concealed, like INIT_WORK() that can allocate | 2917 | * cache structures; some well concealed, like INIT_WORK() that can allocate |
2909 | * objects during debug. | 2918 | * objects during debug. |
2910 | * | 2919 | * |
2911 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | 2920 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back |
2912 | * to it. This may not be a bounded recursion: since the first cache creation | 2921 | * to it. This may not be a bounded recursion: since the first cache creation |
2913 | * failed to complete (waiting on the allocation), we'll just try to create the | 2922 | * failed to complete (waiting on the allocation), we'll just try to create the |
2914 | * cache again, failing at the same point. | 2923 | * cache again, failing at the same point. |
2915 | * | 2924 | * |
2916 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | 2925 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of |
2917 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | 2926 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory |
2918 | * inside the following two functions. | 2927 | * inside the following two functions. |
2919 | */ | 2928 | */ |
2920 | static inline void memcg_stop_kmem_account(void) | 2929 | static inline void memcg_stop_kmem_account(void) |
2921 | { | 2930 | { |
2922 | VM_BUG_ON(!current->mm); | 2931 | VM_BUG_ON(!current->mm); |
2923 | current->memcg_kmem_skip_account++; | 2932 | current->memcg_kmem_skip_account++; |
2924 | } | 2933 | } |
2925 | 2934 | ||
2926 | static inline void memcg_resume_kmem_account(void) | 2935 | static inline void memcg_resume_kmem_account(void) |
2927 | { | 2936 | { |
2928 | VM_BUG_ON(!current->mm); | 2937 | VM_BUG_ON(!current->mm); |
2929 | current->memcg_kmem_skip_account--; | 2938 | current->memcg_kmem_skip_account--; |
2930 | } | 2939 | } |
2931 | 2940 | ||
2932 | int __memcg_cleanup_cache_params(struct kmem_cache *s) | 2941 | int __memcg_cleanup_cache_params(struct kmem_cache *s) |
2933 | { | 2942 | { |
2934 | struct kmem_cache *c; | 2943 | struct kmem_cache *c; |
2935 | int i, failed = 0; | 2944 | int i, failed = 0; |
2936 | 2945 | ||
2937 | mutex_lock(&memcg_slab_mutex); | 2946 | mutex_lock(&memcg_slab_mutex); |
2938 | for_each_memcg_cache_index(i) { | 2947 | for_each_memcg_cache_index(i) { |
2939 | c = cache_from_memcg_idx(s, i); | 2948 | c = cache_from_memcg_idx(s, i); |
2940 | if (!c) | 2949 | if (!c) |
2941 | continue; | 2950 | continue; |
2942 | 2951 | ||
2943 | memcg_unregister_cache(c); | 2952 | memcg_unregister_cache(c); |
2944 | 2953 | ||
2945 | if (cache_from_memcg_idx(s, i)) | 2954 | if (cache_from_memcg_idx(s, i)) |
2946 | failed++; | 2955 | failed++; |
2947 | } | 2956 | } |
2948 | mutex_unlock(&memcg_slab_mutex); | 2957 | mutex_unlock(&memcg_slab_mutex); |
2949 | return failed; | 2958 | return failed; |
2950 | } | 2959 | } |
2951 | 2960 | ||
2952 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) | 2961 | static void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
2953 | { | 2962 | { |
2954 | struct kmem_cache *cachep; | 2963 | struct kmem_cache *cachep; |
2955 | struct memcg_cache_params *params, *tmp; | 2964 | struct memcg_cache_params *params, *tmp; |
2956 | 2965 | ||
2957 | if (!memcg_kmem_is_active(memcg)) | 2966 | if (!memcg_kmem_is_active(memcg)) |
2958 | return; | 2967 | return; |
2959 | 2968 | ||
2960 | mutex_lock(&memcg_slab_mutex); | 2969 | mutex_lock(&memcg_slab_mutex); |
2961 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { | 2970 | list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { |
2962 | cachep = memcg_params_to_cache(params); | 2971 | cachep = memcg_params_to_cache(params); |
2963 | kmem_cache_shrink(cachep); | 2972 | kmem_cache_shrink(cachep); |
2964 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | 2973 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) |
2965 | memcg_unregister_cache(cachep); | 2974 | memcg_unregister_cache(cachep); |
2966 | } | 2975 | } |
2967 | mutex_unlock(&memcg_slab_mutex); | 2976 | mutex_unlock(&memcg_slab_mutex); |
2968 | } | 2977 | } |
2969 | 2978 | ||
2970 | struct memcg_register_cache_work { | 2979 | struct memcg_register_cache_work { |
2971 | struct mem_cgroup *memcg; | 2980 | struct mem_cgroup *memcg; |
2972 | struct kmem_cache *cachep; | 2981 | struct kmem_cache *cachep; |
2973 | struct work_struct work; | 2982 | struct work_struct work; |
2974 | }; | 2983 | }; |
2975 | 2984 | ||
2976 | static void memcg_register_cache_func(struct work_struct *w) | 2985 | static void memcg_register_cache_func(struct work_struct *w) |
2977 | { | 2986 | { |
2978 | struct memcg_register_cache_work *cw = | 2987 | struct memcg_register_cache_work *cw = |
2979 | container_of(w, struct memcg_register_cache_work, work); | 2988 | container_of(w, struct memcg_register_cache_work, work); |
2980 | struct mem_cgroup *memcg = cw->memcg; | 2989 | struct mem_cgroup *memcg = cw->memcg; |
2981 | struct kmem_cache *cachep = cw->cachep; | 2990 | struct kmem_cache *cachep = cw->cachep; |
2982 | 2991 | ||
2983 | mutex_lock(&memcg_slab_mutex); | 2992 | mutex_lock(&memcg_slab_mutex); |
2984 | memcg_register_cache(memcg, cachep); | 2993 | memcg_register_cache(memcg, cachep); |
2985 | mutex_unlock(&memcg_slab_mutex); | 2994 | mutex_unlock(&memcg_slab_mutex); |
2986 | 2995 | ||
2987 | css_put(&memcg->css); | 2996 | css_put(&memcg->css); |
2988 | kfree(cw); | 2997 | kfree(cw); |
2989 | } | 2998 | } |
2990 | 2999 | ||
2991 | /* | 3000 | /* |
2992 | * Enqueue the creation of a per-memcg kmem_cache. | 3001 | * Enqueue the creation of a per-memcg kmem_cache. |
2993 | */ | 3002 | */ |
2994 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, | 3003 | static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, |
2995 | struct kmem_cache *cachep) | 3004 | struct kmem_cache *cachep) |
2996 | { | 3005 | { |
2997 | struct memcg_register_cache_work *cw; | 3006 | struct memcg_register_cache_work *cw; |
2998 | 3007 | ||
2999 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); | 3008 | cw = kmalloc(sizeof(*cw), GFP_NOWAIT); |
3000 | if (cw == NULL) { | 3009 | if (cw == NULL) { |
3001 | css_put(&memcg->css); | 3010 | css_put(&memcg->css); |
3002 | return; | 3011 | return; |
3003 | } | 3012 | } |
3004 | 3013 | ||
3005 | cw->memcg = memcg; | 3014 | cw->memcg = memcg; |
3006 | cw->cachep = cachep; | 3015 | cw->cachep = cachep; |
3007 | 3016 | ||
3008 | INIT_WORK(&cw->work, memcg_register_cache_func); | 3017 | INIT_WORK(&cw->work, memcg_register_cache_func); |
3009 | schedule_work(&cw->work); | 3018 | schedule_work(&cw->work); |
3010 | } | 3019 | } |
3011 | 3020 | ||
3012 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, | 3021 | static void memcg_schedule_register_cache(struct mem_cgroup *memcg, |
3013 | struct kmem_cache *cachep) | 3022 | struct kmem_cache *cachep) |
3014 | { | 3023 | { |
3015 | /* | 3024 | /* |
3016 | * We need to stop accounting when we kmalloc, because if the | 3025 | * We need to stop accounting when we kmalloc, because if the |
3017 | * corresponding kmalloc cache is not yet created, the first allocation | 3026 | * corresponding kmalloc cache is not yet created, the first allocation |
3018 | * in __memcg_schedule_register_cache will recurse. | 3027 | * in __memcg_schedule_register_cache will recurse. |
3019 | * | 3028 | * |
3020 | * However, it is better to enclose the whole function. Depending on | 3029 | * However, it is better to enclose the whole function. Depending on |
3021 | * the debugging options enabled, INIT_WORK(), for instance, can | 3030 | * the debugging options enabled, INIT_WORK(), for instance, can |
3022 | * trigger an allocation. This too, will make us recurse. Because at | 3031 | * trigger an allocation. This too, will make us recurse. Because at |
3023 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | 3032 | * this point we can't allow ourselves back into memcg_kmem_get_cache, |
3024 | * the safest choice is to do it like this, wrapping the whole function. | 3033 | * the safest choice is to do it like this, wrapping the whole function. |
3025 | */ | 3034 | */ |
3026 | memcg_stop_kmem_account(); | 3035 | memcg_stop_kmem_account(); |
3027 | __memcg_schedule_register_cache(memcg, cachep); | 3036 | __memcg_schedule_register_cache(memcg, cachep); |
3028 | memcg_resume_kmem_account(); | 3037 | memcg_resume_kmem_account(); |
3029 | } | 3038 | } |
3030 | 3039 | ||
3031 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) | 3040 | int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) |
3032 | { | 3041 | { |
3033 | unsigned int nr_pages = 1 << order; | 3042 | unsigned int nr_pages = 1 << order; |
3034 | int res; | 3043 | int res; |
3035 | 3044 | ||
3036 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); | 3045 | res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages); |
3037 | if (!res) | 3046 | if (!res) |
3038 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); | 3047 | atomic_add(nr_pages, &cachep->memcg_params->nr_pages); |
3039 | return res; | 3048 | return res; |
3040 | } | 3049 | } |
3041 | 3050 | ||
3042 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) | 3051 | void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) |
3043 | { | 3052 | { |
3044 | unsigned int nr_pages = 1 << order; | 3053 | unsigned int nr_pages = 1 << order; |
3045 | 3054 | ||
3046 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); | 3055 | memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages); |
3047 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); | 3056 | atomic_sub(nr_pages, &cachep->memcg_params->nr_pages); |
3048 | } | 3057 | } |
3049 | 3058 | ||
3050 | /* | 3059 | /* |
3051 | * Return the kmem_cache we're supposed to use for a slab allocation. | 3060 | * Return the kmem_cache we're supposed to use for a slab allocation. |
3052 | * We try to use the current memcg's version of the cache. | 3061 | * We try to use the current memcg's version of the cache. |
3053 | * | 3062 | * |
3054 | * If the cache does not exist yet and we are the first user of it, | 3063 | * If the cache does not exist yet and we are the first user of it, |
3055 | * we either create it immediately, if possible, or create it asynchronously | 3064 | * we either create it immediately, if possible, or create it asynchronously |
3056 | * in a workqueue. | 3065 | * in a workqueue. |
3057 | * In the latter case, we will let the current allocation go through with | 3066 | * In the latter case, we will let the current allocation go through with |
3058 | * the original cache. | 3067 | * the original cache. |
3059 | * | 3068 | * |
3060 | * Can't be called in interrupt context or from kernel threads. | 3069 | * Can't be called in interrupt context or from kernel threads. |
3061 | * This function needs to be called with rcu_read_lock() held. | 3070 | * This function needs to be called with rcu_read_lock() held. |
3062 | */ | 3071 | */ |
3063 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | 3072 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, |
3064 | gfp_t gfp) | 3073 | gfp_t gfp) |
3065 | { | 3074 | { |
3066 | struct mem_cgroup *memcg; | 3075 | struct mem_cgroup *memcg; |
3067 | struct kmem_cache *memcg_cachep; | 3076 | struct kmem_cache *memcg_cachep; |
3068 | 3077 | ||
3069 | VM_BUG_ON(!cachep->memcg_params); | 3078 | VM_BUG_ON(!cachep->memcg_params); |
3070 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | 3079 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); |
3071 | 3080 | ||
3072 | if (!current->mm || current->memcg_kmem_skip_account) | 3081 | if (!current->mm || current->memcg_kmem_skip_account) |
3073 | return cachep; | 3082 | return cachep; |
3074 | 3083 | ||
3075 | rcu_read_lock(); | 3084 | rcu_read_lock(); |
3076 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | 3085 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); |
3077 | 3086 | ||
3078 | if (!memcg_kmem_is_active(memcg)) | 3087 | if (!memcg_kmem_is_active(memcg)) |
3079 | goto out; | 3088 | goto out; |
3080 | 3089 | ||
3081 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 3090 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
3082 | if (likely(memcg_cachep)) { | 3091 | if (likely(memcg_cachep)) { |
3083 | cachep = memcg_cachep; | 3092 | cachep = memcg_cachep; |
3084 | goto out; | 3093 | goto out; |
3085 | } | 3094 | } |
3086 | 3095 | ||
3087 | /* The corresponding put will be done in the workqueue. */ | 3096 | /* The corresponding put will be done in the workqueue. */ |
3088 | if (!css_tryget_online(&memcg->css)) | 3097 | if (!css_tryget_online(&memcg->css)) |
3089 | goto out; | 3098 | goto out; |
3090 | rcu_read_unlock(); | 3099 | rcu_read_unlock(); |
3091 | 3100 | ||
3092 | /* | 3101 | /* |
3093 | * If we are in a safe context (can wait, and not in interrupt | 3102 | * If we are in a safe context (can wait, and not in interrupt |
3094 | * context), we could be predictable and return right away. | 3103 | * context), we could be predictable and return right away. |
3095 | * This would guarantee that the allocation being performed | 3104 | * This would guarantee that the allocation being performed |
3096 | * already belongs in the new cache. | 3105 | * already belongs in the new cache. |
3097 | * | 3106 | * |
3098 | * However, there are some clashes that can arise from locking. | 3107 | * However, there are some clashes that can arise from locking. |
3099 | * For instance, because we acquire the slab_mutex while doing | 3108 | * For instance, because we acquire the slab_mutex while doing |
3100 | * memcg_create_kmem_cache, this means no further allocation | 3109 | * memcg_create_kmem_cache, this means no further allocation |
3101 | * could happen with the slab_mutex held. So it's better to | 3110 | * could happen with the slab_mutex held. So it's better to |
3102 | * defer everything. | 3111 | * defer everything. |
3103 | */ | 3112 | */ |
3104 | memcg_schedule_register_cache(memcg, cachep); | 3113 | memcg_schedule_register_cache(memcg, cachep); |
3105 | return cachep; | 3114 | return cachep; |
3106 | out: | 3115 | out: |
3107 | rcu_read_unlock(); | 3116 | rcu_read_unlock(); |
3108 | return cachep; | 3117 | return cachep; |
3109 | } | 3118 | } |
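/*
 * Illustrative sketch, not part of this commit: roughly how a slab
 * allocation path is expected to use the cache selector above. The
 * helper name example_slab_alloc() is made up for illustration; the
 * real call sites are the slab allocators' allocation hooks.
 */
static void *example_slab_alloc(struct kmem_cache *cachep, gfp_t gfp)
{
	/*
	 * Pick the current memcg's copy of the cache if it exists;
	 * otherwise this returns the root cache and schedules the
	 * per-memcg copy to be created in the background.
	 */
	cachep = memcg_kmem_get_cache(cachep, gfp);
	return kmem_cache_alloc(cachep, gfp);
}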
3110 | 3119 | ||
3111 | /* | 3120 | /* |
3112 | * We need to verify if the allocation against current->mm->owner's memcg is | 3121 | * We need to verify if the allocation against current->mm->owner's memcg is |
3113 | * possible for the given order. But the page is not allocated yet, so we'll | 3122 | * possible for the given order. But the page is not allocated yet, so we'll |
3114 | * need a further commit step to do the final arrangements. | 3123 | * need a further commit step to do the final arrangements. |
3115 | * | 3124 | * |
3116 | * It is possible for the task to switch cgroups in the meantime, so at | 3125 | * It is possible for the task to switch cgroups in the meantime, so at |
3117 | * commit time, we can't rely on task conversion any longer. We'll then use | 3126 | * commit time, we can't rely on task conversion any longer. We'll then use |
3118 | * the handle argument to return to the caller which cgroup we should commit | 3127 | * the handle argument to return to the caller which cgroup we should commit |
3119 | * against. We could also return the memcg directly and avoid the pointer | 3128 | * against. We could also return the memcg directly and avoid the pointer |
3120 | * passing, but a boolean return value gives better semantics considering | 3129 | * passing, but a boolean return value gives better semantics considering |
3121 | * the compiled-out case as well. | 3130 | * the compiled-out case as well. |
3122 | * | 3131 | * |
3123 | * Returning true means the allocation is possible. | 3132 | * Returning true means the allocation is possible. |
3124 | */ | 3133 | */ |
3125 | bool | 3134 | bool |
3126 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | 3135 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) |
3127 | { | 3136 | { |
3128 | struct mem_cgroup *memcg; | 3137 | struct mem_cgroup *memcg; |
3129 | int ret; | 3138 | int ret; |
3130 | 3139 | ||
3131 | *_memcg = NULL; | 3140 | *_memcg = NULL; |
3132 | 3141 | ||
3133 | /* | 3142 | /* |
3134 | * Disabling accounting is only relevant for some specific memcg | 3143 | * Disabling accounting is only relevant for some specific memcg |
3135 | * internal allocations. Therefore we would initially not have such a | 3144 | * internal allocations. Therefore we would initially not have such a |
3136 | * check here, since direct calls to the page allocator that are | 3145 | * check here, since direct calls to the page allocator that are |
3137 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen | 3146 | * accounted to kmemcg (alloc_kmem_pages and friends) only happen |
3138 | * outside memcg core. We are mostly concerned with cache allocations, | 3147 | * outside memcg core. We are mostly concerned with cache allocations, |
3139 | * and by having this test at memcg_kmem_get_cache, we are already able | 3148 | * and by having this test at memcg_kmem_get_cache, we are already able |
3140 | * to relay the allocation to the root cache and bypass the memcg cache | 3149 | * to relay the allocation to the root cache and bypass the memcg cache |
3141 | * altogether. | 3150 | * altogether. |
3142 | * | 3151 | * |
3143 | * There is one exception, though: the SLUB allocator does not create | 3152 | * There is one exception, though: the SLUB allocator does not create |
3144 | * large order caches, but rather services large kmallocs directly from | 3153 | * large order caches, but rather services large kmallocs directly from |
3145 | * the page allocator. Therefore, the following sequence when backed by | 3154 | * the page allocator. Therefore, the following sequence when backed by |
3146 | * the SLUB allocator: | 3155 | * the SLUB allocator: |
3147 | * | 3156 | * |
3148 | * memcg_stop_kmem_account(); | 3157 | * memcg_stop_kmem_account(); |
3149 | * kmalloc(<large_number>) | 3158 | * kmalloc(<large_number>) |
3150 | * memcg_resume_kmem_account(); | 3159 | * memcg_resume_kmem_account(); |
3151 | * | 3160 | * |
3152 | * would effectively ignore the fact that we should skip accounting, | 3161 | * would effectively ignore the fact that we should skip accounting, |
3153 | * since it will drive us directly to this function without passing | 3162 | * since it will drive us directly to this function without passing |
3154 | * through the cache selector memcg_kmem_get_cache. Such large | 3163 | * through the cache selector memcg_kmem_get_cache. Such large |
3155 | * allocations are extremely rare but can happen, for instance, for the | 3164 | * allocations are extremely rare but can happen, for instance, for the |
3156 | * cache arrays. We bring this test here. | 3165 | * cache arrays. We bring this test here. |
3157 | */ | 3166 | */ |
3158 | if (!current->mm || current->memcg_kmem_skip_account) | 3167 | if (!current->mm || current->memcg_kmem_skip_account) |
3159 | return true; | 3168 | return true; |
3160 | 3169 | ||
3161 | memcg = get_mem_cgroup_from_mm(current->mm); | 3170 | memcg = get_mem_cgroup_from_mm(current->mm); |
3162 | 3171 | ||
3163 | if (!memcg_kmem_is_active(memcg)) { | 3172 | if (!memcg_kmem_is_active(memcg)) { |
3164 | css_put(&memcg->css); | 3173 | css_put(&memcg->css); |
3165 | return true; | 3174 | return true; |
3166 | } | 3175 | } |
3167 | 3176 | ||
3168 | ret = memcg_charge_kmem(memcg, gfp, 1 << order); | 3177 | ret = memcg_charge_kmem(memcg, gfp, 1 << order); |
3169 | if (!ret) | 3178 | if (!ret) |
3170 | *_memcg = memcg; | 3179 | *_memcg = memcg; |
3171 | 3180 | ||
3172 | css_put(&memcg->css); | 3181 | css_put(&memcg->css); |
3173 | return (ret == 0); | 3182 | return (ret == 0); |
3174 | } | 3183 | } |
3175 | 3184 | ||
3176 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | 3185 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, |
3177 | int order) | 3186 | int order) |
3178 | { | 3187 | { |
3179 | struct page_cgroup *pc; | 3188 | struct page_cgroup *pc; |
3180 | 3189 | ||
3181 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | 3190 | VM_BUG_ON(mem_cgroup_is_root(memcg)); |
3182 | 3191 | ||
3183 | /* The page allocation failed. Revert */ | 3192 | /* The page allocation failed. Revert */ |
3184 | if (!page) { | 3193 | if (!page) { |
3185 | memcg_uncharge_kmem(memcg, 1 << order); | 3194 | memcg_uncharge_kmem(memcg, 1 << order); |
3186 | return; | 3195 | return; |
3187 | } | 3196 | } |
3188 | /* | 3197 | /* |
3189 | * The page is freshly allocated and not visible to any | 3198 | * The page is freshly allocated and not visible to any |
3190 | * outside callers yet. Set up pc non-atomically. | 3199 | * outside callers yet. Set up pc non-atomically. |
3191 | */ | 3200 | */ |
3192 | pc = lookup_page_cgroup(page); | 3201 | pc = lookup_page_cgroup(page); |
3193 | pc->mem_cgroup = memcg; | 3202 | pc->mem_cgroup = memcg; |
3194 | pc->flags = PCG_USED; | 3203 | pc->flags = PCG_USED; |
3195 | } | 3204 | } |
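/*
 * Illustrative sketch, not part of this commit: the reserve/commit
 * protocol described above, modelled loosely on the alloc_kmem_pages()
 * path. example_alloc_kmem_pages() is a made-up name used only to show
 * the pairing of the two calls.
 */
static struct page *example_alloc_kmem_pages(gfp_t gfp, unsigned int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	/* Reserve the charge against current's memcg (or bypass). */
	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;
	page = alloc_pages(gfp, order);
	/* Bind the charge to the page, or revert it if allocation failed. */
	memcg_kmem_commit_charge(page, memcg, order);
	return page;
}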
3196 | 3205 | ||
3197 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | 3206 | void __memcg_kmem_uncharge_pages(struct page *page, int order) |
3198 | { | 3207 | { |
3199 | struct mem_cgroup *memcg = NULL; | 3208 | struct mem_cgroup *memcg = NULL; |
3200 | struct page_cgroup *pc; | 3209 | struct page_cgroup *pc; |
3201 | 3210 | ||
3202 | 3211 | ||
3203 | pc = lookup_page_cgroup(page); | 3212 | pc = lookup_page_cgroup(page); |
3204 | if (!PageCgroupUsed(pc)) | 3213 | if (!PageCgroupUsed(pc)) |
3205 | return; | 3214 | return; |
3206 | 3215 | ||
3207 | memcg = pc->mem_cgroup; | 3216 | memcg = pc->mem_cgroup; |
3208 | pc->flags = 0; | 3217 | pc->flags = 0; |
3209 | 3218 | ||
3210 | /* | 3219 | /* |
3211 | * We trust that the allocation is valid only if there is a memcg | 3220 | * We trust that the allocation is valid only if there is a memcg |
3212 | * associated with the page. | 3221 | * associated with the page. |
3213 | */ | 3222 | */ |
3214 | if (!memcg) | 3223 | if (!memcg) |
3215 | return; | 3224 | return; |
3216 | 3225 | ||
3217 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); | 3226 | VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); |
3218 | memcg_uncharge_kmem(memcg, 1 << order); | 3227 | memcg_uncharge_kmem(memcg, 1 << order); |
3219 | } | 3228 | } |
3220 | #else | 3229 | #else |
3221 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) | 3230 | static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) |
3222 | { | 3231 | { |
3223 | } | 3232 | } |
3224 | #endif /* CONFIG_MEMCG_KMEM */ | 3233 | #endif /* CONFIG_MEMCG_KMEM */ |
3225 | 3234 | ||
3226 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 3235 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
3227 | 3236 | ||
3228 | /* | 3237 | /* |
3229 | * Because tail pages are not marked as "used", set it. We're under | 3238 | * Because tail pages are not marked as "used", set it. We're under |
3230 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 3239 | * zone->lru_lock, 'splitting on pmd' and compound_lock. |
3231 | * charge/uncharge will never happen and move_account() is done under | 3240 | * charge/uncharge will never happen and move_account() is done under |
3232 | * compound_lock(), so we don't have to take care of races. | 3241 | * compound_lock(), so we don't have to take care of races. |
3233 | */ | 3242 | */ |
3234 | void mem_cgroup_split_huge_fixup(struct page *head) | 3243 | void mem_cgroup_split_huge_fixup(struct page *head) |
3235 | { | 3244 | { |
3236 | struct page_cgroup *head_pc = lookup_page_cgroup(head); | 3245 | struct page_cgroup *head_pc = lookup_page_cgroup(head); |
3237 | struct page_cgroup *pc; | 3246 | struct page_cgroup *pc; |
3238 | struct mem_cgroup *memcg; | 3247 | struct mem_cgroup *memcg; |
3239 | int i; | 3248 | int i; |
3240 | 3249 | ||
3241 | if (mem_cgroup_disabled()) | 3250 | if (mem_cgroup_disabled()) |
3242 | return; | 3251 | return; |
3243 | 3252 | ||
3244 | memcg = head_pc->mem_cgroup; | 3253 | memcg = head_pc->mem_cgroup; |
3245 | for (i = 1; i < HPAGE_PMD_NR; i++) { | 3254 | for (i = 1; i < HPAGE_PMD_NR; i++) { |
3246 | pc = head_pc + i; | 3255 | pc = head_pc + i; |
3247 | pc->mem_cgroup = memcg; | 3256 | pc->mem_cgroup = memcg; |
3248 | pc->flags = head_pc->flags; | 3257 | pc->flags = head_pc->flags; |
3249 | } | 3258 | } |
3250 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 3259 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
3251 | HPAGE_PMD_NR); | 3260 | HPAGE_PMD_NR); |
3252 | } | 3261 | } |
3253 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 3262 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
3254 | 3263 | ||
3255 | /** | 3264 | /** |
3256 | * mem_cgroup_move_account - move account of the page | 3265 | * mem_cgroup_move_account - move account of the page |
3257 | * @page: the page | 3266 | * @page: the page |
3258 | * @nr_pages: number of regular pages (>1 for huge pages) | 3267 | * @nr_pages: number of regular pages (>1 for huge pages) |
3259 | * @pc: page_cgroup of the page. | 3268 | * @pc: page_cgroup of the page. |
3260 | * @from: mem_cgroup which the page is moved from. | 3269 | * @from: mem_cgroup which the page is moved from. |
3261 | * @to: mem_cgroup which the page is moved to. @from != @to. | 3270 | * @to: mem_cgroup which the page is moved to. @from != @to. |
3262 | * | 3271 | * |
3263 | * The caller must confirm the following. | 3272 | * The caller must confirm the following. |
3264 | * - page is not on LRU (isolate_page() is useful.) | 3273 | * - page is not on LRU (isolate_page() is useful.) |
3265 | * - compound_lock is held when nr_pages > 1 | 3274 | * - compound_lock is held when nr_pages > 1 |
3266 | * | 3275 | * |
3267 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | 3276 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" |
3268 | * from old cgroup. | 3277 | * from old cgroup. |
3269 | */ | 3278 | */ |
3270 | static int mem_cgroup_move_account(struct page *page, | 3279 | static int mem_cgroup_move_account(struct page *page, |
3271 | unsigned int nr_pages, | 3280 | unsigned int nr_pages, |
3272 | struct page_cgroup *pc, | 3281 | struct page_cgroup *pc, |
3273 | struct mem_cgroup *from, | 3282 | struct mem_cgroup *from, |
3274 | struct mem_cgroup *to) | 3283 | struct mem_cgroup *to) |
3275 | { | 3284 | { |
3276 | unsigned long flags; | 3285 | unsigned long flags; |
3277 | int ret; | 3286 | int ret; |
3278 | 3287 | ||
3279 | VM_BUG_ON(from == to); | 3288 | VM_BUG_ON(from == to); |
3280 | VM_BUG_ON_PAGE(PageLRU(page), page); | 3289 | VM_BUG_ON_PAGE(PageLRU(page), page); |
3281 | /* | 3290 | /* |
3282 | * The page is isolated from LRU. So, collapse function | 3291 | * The page is isolated from LRU. So, collapse function |
3283 | * will not handle this page. But page splitting can happen. | 3292 | * will not handle this page. But page splitting can happen. |
3284 | * Do this check under compound_page_lock(). The caller should | 3293 | * Do this check under compound_page_lock(). The caller should |
3285 | * hold it. | 3294 | * hold it. |
3286 | */ | 3295 | */ |
3287 | ret = -EBUSY; | 3296 | ret = -EBUSY; |
3288 | if (nr_pages > 1 && !PageTransHuge(page)) | 3297 | if (nr_pages > 1 && !PageTransHuge(page)) |
3289 | goto out; | 3298 | goto out; |
3290 | 3299 | ||
3291 | /* | 3300 | /* |
3292 | * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup | 3301 | * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup |
3293 | * of its source page while we change it: page migration takes | 3302 | * of its source page while we change it: page migration takes |
3294 | * both pages off the LRU, but page cache replacement doesn't. | 3303 | * both pages off the LRU, but page cache replacement doesn't. |
3295 | */ | 3304 | */ |
3296 | if (!trylock_page(page)) | 3305 | if (!trylock_page(page)) |
3297 | goto out; | 3306 | goto out; |
3298 | 3307 | ||
3299 | ret = -EINVAL; | 3308 | ret = -EINVAL; |
3300 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) | 3309 | if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) |
3301 | goto out_unlock; | 3310 | goto out_unlock; |
3302 | 3311 | ||
3303 | move_lock_mem_cgroup(from, &flags); | 3312 | move_lock_mem_cgroup(from, &flags); |
3304 | 3313 | ||
3305 | if (!PageAnon(page) && page_mapped(page)) { | 3314 | if (!PageAnon(page) && page_mapped(page)) { |
3306 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | 3315 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
3307 | nr_pages); | 3316 | nr_pages); |
3308 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | 3317 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], |
3309 | nr_pages); | 3318 | nr_pages); |
3310 | } | 3319 | } |
3311 | 3320 | ||
3312 | if (PageWriteback(page)) { | 3321 | if (PageWriteback(page)) { |
3313 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | 3322 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], |
3314 | nr_pages); | 3323 | nr_pages); |
3315 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | 3324 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], |
3316 | nr_pages); | 3325 | nr_pages); |
3317 | } | 3326 | } |
3318 | 3327 | ||
3319 | /* | 3328 | /* |
3320 | * It is safe to change pc->mem_cgroup here because the page | 3329 | * It is safe to change pc->mem_cgroup here because the page |
3321 | * is referenced, charged, and isolated - we can't race with | 3330 | * is referenced, charged, and isolated - we can't race with |
3322 | * uncharging, charging, migration, or LRU putback. | 3331 | * uncharging, charging, migration, or LRU putback. |
3323 | */ | 3332 | */ |
3324 | 3333 | ||
3325 | /* caller should have done css_get */ | 3334 | /* caller should have done css_get */ |
3326 | pc->mem_cgroup = to; | 3335 | pc->mem_cgroup = to; |
3327 | move_unlock_mem_cgroup(from, &flags); | 3336 | move_unlock_mem_cgroup(from, &flags); |
3328 | ret = 0; | 3337 | ret = 0; |
3329 | 3338 | ||
3330 | local_irq_disable(); | 3339 | local_irq_disable(); |
3331 | mem_cgroup_charge_statistics(to, page, nr_pages); | 3340 | mem_cgroup_charge_statistics(to, page, nr_pages); |
3332 | memcg_check_events(to, page); | 3341 | memcg_check_events(to, page); |
3333 | mem_cgroup_charge_statistics(from, page, -nr_pages); | 3342 | mem_cgroup_charge_statistics(from, page, -nr_pages); |
3334 | memcg_check_events(from, page); | 3343 | memcg_check_events(from, page); |
3335 | local_irq_enable(); | 3344 | local_irq_enable(); |
3336 | out_unlock: | 3345 | out_unlock: |
3337 | unlock_page(page); | 3346 | unlock_page(page); |
3338 | out: | 3347 | out: |
3339 | return ret; | 3348 | return ret; |
3340 | } | 3349 | } |
3341 | 3350 | ||
3342 | /** | 3351 | /** |
3343 | * mem_cgroup_move_parent - moves page to the parent group | 3352 | * mem_cgroup_move_parent - moves page to the parent group |
3344 | * @page: the page to move | 3353 | * @page: the page to move |
3345 | * @pc: page_cgroup of the page | 3354 | * @pc: page_cgroup of the page |
3346 | * @child: page's cgroup | 3355 | * @child: page's cgroup |
3347 | * | 3356 | * |
3348 | * move charges to its parent or the root cgroup if the group has no | 3357 | * move charges to its parent or the root cgroup if the group has no |
3349 | * parent (aka use_hierarchy==0). | 3358 | * parent (aka use_hierarchy==0). |
3350 | * Although this might fail (get_page_unless_zero, isolate_lru_page or | 3359 | * Although this might fail (get_page_unless_zero, isolate_lru_page or |
3351 | * mem_cgroup_move_account fails) the failure is always temporary and | 3360 | * mem_cgroup_move_account fails) the failure is always temporary and |
3352 | * it signals a race with a page removal/uncharge or migration. In the | 3361 | * it signals a race with a page removal/uncharge or migration. In the |
3353 | * first case the page is on the way out and it will vanish from the LRU | 3362 | * first case the page is on the way out and it will vanish from the LRU |
3354 | * on the next attempt and the call should be retried later. | 3363 | * on the next attempt and the call should be retried later. |
3355 | * Isolation from the LRU fails only if page has been isolated from | 3364 | * Isolation from the LRU fails only if page has been isolated from |
3356 | * the LRU since we looked at it and that usually means either global | 3365 | * the LRU since we looked at it and that usually means either global |
3357 | * reclaim or migration going on. The page will either get back to the | 3366 | * reclaim or migration going on. The page will either get back to the |
3358 | * LRU or vanish. | 3367 | * LRU or vanish. |
3359 | * Finally, mem_cgroup_move_account fails only if the page got uncharged | 3368 | * Finally, mem_cgroup_move_account fails only if the page got uncharged |
3360 | * (!PageCgroupUsed) or moved to a different group. The page will | 3369 | * (!PageCgroupUsed) or moved to a different group. The page will |
3361 | * disappear in the next attempt. | 3370 | * disappear in the next attempt. |
3362 | */ | 3371 | */ |
3363 | static int mem_cgroup_move_parent(struct page *page, | 3372 | static int mem_cgroup_move_parent(struct page *page, |
3364 | struct page_cgroup *pc, | 3373 | struct page_cgroup *pc, |
3365 | struct mem_cgroup *child) | 3374 | struct mem_cgroup *child) |
3366 | { | 3375 | { |
3367 | struct mem_cgroup *parent; | 3376 | struct mem_cgroup *parent; |
3368 | unsigned int nr_pages; | 3377 | unsigned int nr_pages; |
3369 | unsigned long uninitialized_var(flags); | 3378 | unsigned long uninitialized_var(flags); |
3370 | int ret; | 3379 | int ret; |
3371 | 3380 | ||
3372 | VM_BUG_ON(mem_cgroup_is_root(child)); | 3381 | VM_BUG_ON(mem_cgroup_is_root(child)); |
3373 | 3382 | ||
3374 | ret = -EBUSY; | 3383 | ret = -EBUSY; |
3375 | if (!get_page_unless_zero(page)) | 3384 | if (!get_page_unless_zero(page)) |
3376 | goto out; | 3385 | goto out; |
3377 | if (isolate_lru_page(page)) | 3386 | if (isolate_lru_page(page)) |
3378 | goto put; | 3387 | goto put; |
3379 | 3388 | ||
3380 | nr_pages = hpage_nr_pages(page); | 3389 | nr_pages = hpage_nr_pages(page); |
3381 | 3390 | ||
3382 | parent = parent_mem_cgroup(child); | 3391 | parent = parent_mem_cgroup(child); |
3383 | /* | 3392 | /* |
3384 | * If no parent, move charges to root cgroup. | 3393 | * If no parent, move charges to root cgroup. |
3385 | */ | 3394 | */ |
3386 | if (!parent) | 3395 | if (!parent) |
3387 | parent = root_mem_cgroup; | 3396 | parent = root_mem_cgroup; |
3388 | 3397 | ||
3389 | if (nr_pages > 1) { | 3398 | if (nr_pages > 1) { |
3390 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 3399 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
3391 | flags = compound_lock_irqsave(page); | 3400 | flags = compound_lock_irqsave(page); |
3392 | } | 3401 | } |
3393 | 3402 | ||
3394 | ret = mem_cgroup_move_account(page, nr_pages, | 3403 | ret = mem_cgroup_move_account(page, nr_pages, |
3395 | pc, child, parent); | 3404 | pc, child, parent); |
3396 | if (!ret) { | 3405 | if (!ret) { |
3406 | if (!mem_cgroup_is_root(parent)) | ||
3407 | css_get_many(&parent->css, nr_pages); | ||
3397 | /* Take charge off the local counters */ | 3408 | /* Take charge off the local counters */ |
3398 | page_counter_cancel(&child->memory, nr_pages); | 3409 | page_counter_cancel(&child->memory, nr_pages); |
3399 | if (do_swap_account) | 3410 | if (do_swap_account) |
3400 | page_counter_cancel(&child->memsw, nr_pages); | 3411 | page_counter_cancel(&child->memsw, nr_pages); |
3412 | css_put_many(&child->css, nr_pages); | ||
3401 | } | 3413 | } |
3402 | 3414 | ||
3403 | if (nr_pages > 1) | 3415 | if (nr_pages > 1) |
3404 | compound_unlock_irqrestore(page, flags); | 3416 | compound_unlock_irqrestore(page, flags); |
3405 | putback_lru_page(page); | 3417 | putback_lru_page(page); |
3406 | put: | 3418 | put: |
3407 | put_page(page); | 3419 | put_page(page); |
3408 | out: | 3420 | out: |
3409 | return ret; | 3421 | return ret; |
3410 | } | 3422 | } |
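/*
 * Illustrative sketch, not part of this commit: because every failure
 * mode documented above is transient, a caller such as the offline-time
 * reparenting code can simply retry on its next pass over the LRU.
 * example_try_reparent() is a made-up, simplified stand-in for that
 * caller.
 */
static void example_try_reparent(struct page *page, struct mem_cgroup *memcg)
{
	struct page_cgroup *pc = lookup_page_cgroup(page);

	/*
	 * -EBUSY or -EINVAL only signal a race with uncharge, migration
	 * or LRU isolation; the page will either vanish or become
	 * movable by the time the list is scanned again.
	 */
	if (mem_cgroup_move_parent(page, pc, memcg))
		cond_resched();
}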
3411 | 3423 | ||
3412 | #ifdef CONFIG_MEMCG_SWAP | 3424 | #ifdef CONFIG_MEMCG_SWAP |
3413 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | 3425 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
3414 | bool charge) | 3426 | bool charge) |
3415 | { | 3427 | { |
3416 | int val = (charge) ? 1 : -1; | 3428 | int val = (charge) ? 1 : -1; |
3417 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); | 3429 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
3418 | } | 3430 | } |
3419 | 3431 | ||
3420 | /** | 3432 | /** |
3421 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | 3433 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. |
3422 | * @entry: swap entry to be moved | 3434 | * @entry: swap entry to be moved |
3423 | * @from: mem_cgroup which the entry is moved from | 3435 | * @from: mem_cgroup which the entry is moved from |
3424 | * @to: mem_cgroup which the entry is moved to | 3436 | * @to: mem_cgroup which the entry is moved to |
3425 | * | 3437 | * |
3426 | * It succeeds only when the swap_cgroup's record for this entry is the same | 3438 | * It succeeds only when the swap_cgroup's record for this entry is the same |
3427 | * as the mem_cgroup's id of @from. | 3439 | * as the mem_cgroup's id of @from. |
3428 | * | 3440 | * |
3429 | * Returns 0 on success, -EINVAL on failure. | 3441 | * Returns 0 on success, -EINVAL on failure. |
3430 | * | 3442 | * |
3431 | * The caller must have charged to @to, IOW, called page_counter_charge() about | 3443 | * The caller must have charged to @to, IOW, called page_counter_charge() about |
3432 | * both res and memsw, and called css_get(). | 3444 | * both res and memsw, and called css_get(). |
3433 | */ | 3445 | */ |
3434 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | 3446 | static int mem_cgroup_move_swap_account(swp_entry_t entry, |
3435 | struct mem_cgroup *from, struct mem_cgroup *to) | 3447 | struct mem_cgroup *from, struct mem_cgroup *to) |
3436 | { | 3448 | { |
3437 | unsigned short old_id, new_id; | 3449 | unsigned short old_id, new_id; |
3438 | 3450 | ||
3439 | old_id = mem_cgroup_id(from); | 3451 | old_id = mem_cgroup_id(from); |
3440 | new_id = mem_cgroup_id(to); | 3452 | new_id = mem_cgroup_id(to); |
3441 | 3453 | ||
3442 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 3454 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { |
3443 | mem_cgroup_swap_statistics(from, false); | 3455 | mem_cgroup_swap_statistics(from, false); |
3444 | mem_cgroup_swap_statistics(to, true); | 3456 | mem_cgroup_swap_statistics(to, true); |
3445 | /* | 3457 | /* |
3446 | * This function is only called from task migration context now. | 3458 | * This function is only called from task migration context now. |
3447 | * It postpones page_counter and refcount handling until the end | 3459 | * It postpones page_counter and refcount handling until the end |
3448 | * of task migration (mem_cgroup_clear_mc()) for performance | 3460 | * of task migration (mem_cgroup_clear_mc()) for performance |
3449 | * improvement. But we cannot postpone css_get(to) because if | 3461 | * improvement. But we cannot postpone css_get(to) because if |
3450 | * the process that has been moved to @to does swap-in, the | 3462 | * the process that has been moved to @to does swap-in, the |
3451 | * refcount of @to might be decreased to 0. | 3463 | * refcount of @to might be decreased to 0. |
3452 | * | 3464 | * |
3453 | * We are in attach() phase, so the cgroup is guaranteed to be | 3465 | * We are in attach() phase, so the cgroup is guaranteed to be |
3454 | * alive, so we can just call css_get(). | 3466 | * alive, so we can just call css_get(). |
3455 | */ | 3467 | */ |
3456 | css_get(&to->css); | 3468 | css_get(&to->css); |
3457 | return 0; | 3469 | return 0; |
3458 | } | 3470 | } |
3459 | return -EINVAL; | 3471 | return -EINVAL; |
3460 | } | 3472 | } |
3461 | #else | 3473 | #else |
3462 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | 3474 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, |
3463 | struct mem_cgroup *from, struct mem_cgroup *to) | 3475 | struct mem_cgroup *from, struct mem_cgroup *to) |
3464 | { | 3476 | { |
3465 | return -EINVAL; | 3477 | return -EINVAL; |
3466 | } | 3478 | } |
3467 | #endif | 3479 | #endif |
3468 | 3480 | ||
3469 | #ifdef CONFIG_DEBUG_VM | 3481 | #ifdef CONFIG_DEBUG_VM |
3470 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) | 3482 | static struct page_cgroup *lookup_page_cgroup_used(struct page *page) |
3471 | { | 3483 | { |
3472 | struct page_cgroup *pc; | 3484 | struct page_cgroup *pc; |
3473 | 3485 | ||
3474 | pc = lookup_page_cgroup(page); | 3486 | pc = lookup_page_cgroup(page); |
3475 | /* | 3487 | /* |
3476 | * Can be NULL while feeding pages into the page allocator for | 3488 | * Can be NULL while feeding pages into the page allocator for |
3477 | * the first time, i.e. during boot or memory hotplug; | 3489 | * the first time, i.e. during boot or memory hotplug; |
3478 | * or when mem_cgroup_disabled(). | 3490 | * or when mem_cgroup_disabled(). |
3479 | */ | 3491 | */ |
3480 | if (likely(pc) && PageCgroupUsed(pc)) | 3492 | if (likely(pc) && PageCgroupUsed(pc)) |
3481 | return pc; | 3493 | return pc; |
3482 | return NULL; | 3494 | return NULL; |
3483 | } | 3495 | } |
3484 | 3496 | ||
3485 | bool mem_cgroup_bad_page_check(struct page *page) | 3497 | bool mem_cgroup_bad_page_check(struct page *page) |
3486 | { | 3498 | { |
3487 | if (mem_cgroup_disabled()) | 3499 | if (mem_cgroup_disabled()) |
3488 | return false; | 3500 | return false; |
3489 | 3501 | ||
3490 | return lookup_page_cgroup_used(page) != NULL; | 3502 | return lookup_page_cgroup_used(page) != NULL; |
3491 | } | 3503 | } |
3492 | 3504 | ||
3493 | void mem_cgroup_print_bad_page(struct page *page) | 3505 | void mem_cgroup_print_bad_page(struct page *page) |
3494 | { | 3506 | { |
3495 | struct page_cgroup *pc; | 3507 | struct page_cgroup *pc; |
3496 | 3508 | ||
3497 | pc = lookup_page_cgroup_used(page); | 3509 | pc = lookup_page_cgroup_used(page); |
3498 | if (pc) { | 3510 | if (pc) { |
3499 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", | 3511 | pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", |
3500 | pc, pc->flags, pc->mem_cgroup); | 3512 | pc, pc->flags, pc->mem_cgroup); |
3501 | } | 3513 | } |
3502 | } | 3514 | } |
3503 | #endif | 3515 | #endif |
3504 | 3516 | ||
3505 | static DEFINE_MUTEX(memcg_limit_mutex); | 3517 | static DEFINE_MUTEX(memcg_limit_mutex); |
3506 | 3518 | ||
3507 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 3519 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3508 | unsigned long limit) | 3520 | unsigned long limit) |
3509 | { | 3521 | { |
3510 | unsigned long curusage; | 3522 | unsigned long curusage; |
3511 | unsigned long oldusage; | 3523 | unsigned long oldusage; |
3512 | bool enlarge = false; | 3524 | bool enlarge = false; |
3513 | int retry_count; | 3525 | int retry_count; |
3514 | int ret; | 3526 | int ret; |
3515 | 3527 | ||
3516 | /* | 3528 | /* |
3517 | * For keeping hierarchical_reclaim simple, how long we should retry | 3529 | * For keeping hierarchical_reclaim simple, how long we should retry |
3518 | * depends on the caller. We set our retry count to be a function | 3530 | * depends on the caller. We set our retry count to be a function |
3519 | * of the number of children we should visit in this loop. | 3531 | * of the number of children we should visit in this loop. |
3520 | */ | 3532 | */ |
3521 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * | 3533 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3522 | mem_cgroup_count_children(memcg); | 3534 | mem_cgroup_count_children(memcg); |
3523 | 3535 | ||
3524 | oldusage = page_counter_read(&memcg->memory); | 3536 | oldusage = page_counter_read(&memcg->memory); |
3525 | 3537 | ||
3526 | do { | 3538 | do { |
3527 | if (signal_pending(current)) { | 3539 | if (signal_pending(current)) { |
3528 | ret = -EINTR; | 3540 | ret = -EINTR; |
3529 | break; | 3541 | break; |
3530 | } | 3542 | } |
3531 | 3543 | ||
3532 | mutex_lock(&memcg_limit_mutex); | 3544 | mutex_lock(&memcg_limit_mutex); |
3533 | if (limit > memcg->memsw.limit) { | 3545 | if (limit > memcg->memsw.limit) { |
3534 | mutex_unlock(&memcg_limit_mutex); | 3546 | mutex_unlock(&memcg_limit_mutex); |
3535 | ret = -EINVAL; | 3547 | ret = -EINVAL; |
3536 | break; | 3548 | break; |
3537 | } | 3549 | } |
3538 | if (limit > memcg->memory.limit) | 3550 | if (limit > memcg->memory.limit) |
3539 | enlarge = true; | 3551 | enlarge = true; |
3540 | ret = page_counter_limit(&memcg->memory, limit); | 3552 | ret = page_counter_limit(&memcg->memory, limit); |
3541 | mutex_unlock(&memcg_limit_mutex); | 3553 | mutex_unlock(&memcg_limit_mutex); |
3542 | 3554 | ||
3543 | if (!ret) | 3555 | if (!ret) |
3544 | break; | 3556 | break; |
3545 | 3557 | ||
3546 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); | 3558 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); |
3547 | 3559 | ||
3548 | curusage = page_counter_read(&memcg->memory); | 3560 | curusage = page_counter_read(&memcg->memory); |
3549 | /* Usage is reduced? */ | 3561 | /* Usage is reduced? */ |
3550 | if (curusage >= oldusage) | 3562 | if (curusage >= oldusage) |
3551 | retry_count--; | 3563 | retry_count--; |
3552 | else | 3564 | else |
3553 | oldusage = curusage; | 3565 | oldusage = curusage; |
3554 | } while (retry_count); | 3566 | } while (retry_count); |
3555 | 3567 | ||
3556 | if (!ret && enlarge) | 3568 | if (!ret && enlarge) |
3557 | memcg_oom_recover(memcg); | 3569 | memcg_oom_recover(memcg); |
3558 | 3570 | ||
3559 | return ret; | 3571 | return ret; |
3560 | } | 3572 | } |
3561 | 3573 | ||
3562 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 3574 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
3563 | unsigned long limit) | 3575 | unsigned long limit) |
3564 | { | 3576 | { |
3565 | unsigned long curusage; | 3577 | unsigned long curusage; |
3566 | unsigned long oldusage; | 3578 | unsigned long oldusage; |
3567 | bool enlarge = false; | 3579 | bool enlarge = false; |
3568 | int retry_count; | 3580 | int retry_count; |
3569 | int ret; | 3581 | int ret; |
3570 | 3582 | ||
3571 | /* see mem_cgroup_resize_limit */ | 3583 | /* see mem_cgroup_resize_limit */ |
3572 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * | 3584 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * |
3573 | mem_cgroup_count_children(memcg); | 3585 | mem_cgroup_count_children(memcg); |
3574 | 3586 | ||
3575 | oldusage = page_counter_read(&memcg->memsw); | 3587 | oldusage = page_counter_read(&memcg->memsw); |
3576 | 3588 | ||
3577 | do { | 3589 | do { |
3578 | if (signal_pending(current)) { | 3590 | if (signal_pending(current)) { |
3579 | ret = -EINTR; | 3591 | ret = -EINTR; |
3580 | break; | 3592 | break; |
3581 | } | 3593 | } |
3582 | 3594 | ||
3583 | mutex_lock(&memcg_limit_mutex); | 3595 | mutex_lock(&memcg_limit_mutex); |
3584 | if (limit < memcg->memory.limit) { | 3596 | if (limit < memcg->memory.limit) { |
3585 | mutex_unlock(&memcg_limit_mutex); | 3597 | mutex_unlock(&memcg_limit_mutex); |
3586 | ret = -EINVAL; | 3598 | ret = -EINVAL; |
3587 | break; | 3599 | break; |
3588 | } | 3600 | } |
3589 | if (limit > memcg->memsw.limit) | 3601 | if (limit > memcg->memsw.limit) |
3590 | enlarge = true; | 3602 | enlarge = true; |
3591 | ret = page_counter_limit(&memcg->memsw, limit); | 3603 | ret = page_counter_limit(&memcg->memsw, limit); |
3592 | mutex_unlock(&memcg_limit_mutex); | 3604 | mutex_unlock(&memcg_limit_mutex); |
3593 | 3605 | ||
3594 | if (!ret) | 3606 | if (!ret) |
3595 | break; | 3607 | break; |
3596 | 3608 | ||
3597 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); | 3609 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); |
3598 | 3610 | ||
3599 | curusage = page_counter_read(&memcg->memsw); | 3611 | curusage = page_counter_read(&memcg->memsw); |
3600 | /* Usage is reduced? */ | 3612 | /* Usage is reduced? */ |
3601 | if (curusage >= oldusage) | 3613 | if (curusage >= oldusage) |
3602 | retry_count--; | 3614 | retry_count--; |
3603 | else | 3615 | else |
3604 | oldusage = curusage; | 3616 | oldusage = curusage; |
3605 | } while (retry_count); | 3617 | } while (retry_count); |
3606 | 3618 | ||
3607 | if (!ret && enlarge) | 3619 | if (!ret && enlarge) |
3608 | memcg_oom_recover(memcg); | 3620 | memcg_oom_recover(memcg); |
3609 | 3621 | ||
3610 | return ret; | 3622 | return ret; |
3611 | } | 3623 | } |
3612 | 3624 | ||
3613 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 3625 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
3614 | gfp_t gfp_mask, | 3626 | gfp_t gfp_mask, |
3615 | unsigned long *total_scanned) | 3627 | unsigned long *total_scanned) |
3616 | { | 3628 | { |
3617 | unsigned long nr_reclaimed = 0; | 3629 | unsigned long nr_reclaimed = 0; |
3618 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | 3630 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; |
3619 | unsigned long reclaimed; | 3631 | unsigned long reclaimed; |
3620 | int loop = 0; | 3632 | int loop = 0; |
3621 | struct mem_cgroup_tree_per_zone *mctz; | 3633 | struct mem_cgroup_tree_per_zone *mctz; |
3622 | unsigned long excess; | 3634 | unsigned long excess; |
3623 | unsigned long nr_scanned; | 3635 | unsigned long nr_scanned; |
3624 | 3636 | ||
3625 | if (order > 0) | 3637 | if (order > 0) |
3626 | return 0; | 3638 | return 0; |
3627 | 3639 | ||
3628 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); | 3640 | mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); |
3629 | /* | 3641 | /* |
3630 | * This loop can run a while, especially if mem_cgroups continuously | 3642 | * This loop can run a while, especially if mem_cgroups continuously |
3631 | * keep exceeding their soft limit and putting the system under | 3643 | * keep exceeding their soft limit and putting the system under |
3632 | * pressure | 3644 | * pressure |
3633 | */ | 3645 | */ |
3634 | do { | 3646 | do { |
3635 | if (next_mz) | 3647 | if (next_mz) |
3636 | mz = next_mz; | 3648 | mz = next_mz; |
3637 | else | 3649 | else |
3638 | mz = mem_cgroup_largest_soft_limit_node(mctz); | 3650 | mz = mem_cgroup_largest_soft_limit_node(mctz); |
3639 | if (!mz) | 3651 | if (!mz) |
3640 | break; | 3652 | break; |
3641 | 3653 | ||
3642 | nr_scanned = 0; | 3654 | nr_scanned = 0; |
3643 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, | 3655 | reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, |
3644 | gfp_mask, &nr_scanned); | 3656 | gfp_mask, &nr_scanned); |
3645 | nr_reclaimed += reclaimed; | 3657 | nr_reclaimed += reclaimed; |
3646 | *total_scanned += nr_scanned; | 3658 | *total_scanned += nr_scanned; |
3647 | spin_lock_irq(&mctz->lock); | 3659 | spin_lock_irq(&mctz->lock); |
3648 | 3660 | ||
3649 | /* | 3661 | /* |
3650 | * If we failed to reclaim anything from this memory cgroup | 3662 | * If we failed to reclaim anything from this memory cgroup |
3651 | * it is time to move on to the next cgroup | 3663 | * it is time to move on to the next cgroup |
3652 | */ | 3664 | */ |
3653 | next_mz = NULL; | 3665 | next_mz = NULL; |
3654 | if (!reclaimed) { | 3666 | if (!reclaimed) { |
3655 | do { | 3667 | do { |
3656 | /* | 3668 | /* |
3657 | * Loop until we find yet another one. | 3669 | * Loop until we find yet another one. |
3658 | * | 3670 | * |
3659 | * By the time we get the soft_limit lock | 3671 | * By the time we get the soft_limit lock |
3660 | * again, someone might have added the | 3672 | * again, someone might have added the |
3661 | * group back on the RB tree. Iterate to | 3673 | * group back on the RB tree. Iterate to |
3662 | * make sure we get a different mem. | 3674 | * make sure we get a different mem. |
3663 | * mem_cgroup_largest_soft_limit_node returns | 3675 | * mem_cgroup_largest_soft_limit_node returns |
3664 | * NULL if no other cgroup is present on | 3676 | * NULL if no other cgroup is present on |
3665 | * the tree | 3677 | * the tree |
3666 | */ | 3678 | */ |
3667 | next_mz = | 3679 | next_mz = |
3668 | __mem_cgroup_largest_soft_limit_node(mctz); | 3680 | __mem_cgroup_largest_soft_limit_node(mctz); |
3669 | if (next_mz == mz) | 3681 | if (next_mz == mz) |
3670 | css_put(&next_mz->memcg->css); | 3682 | css_put(&next_mz->memcg->css); |
3671 | else /* next_mz == NULL or other memcg */ | 3683 | else /* next_mz == NULL or other memcg */ |
3672 | break; | 3684 | break; |
3673 | } while (1); | 3685 | } while (1); |
3674 | } | 3686 | } |
3675 | __mem_cgroup_remove_exceeded(mz, mctz); | 3687 | __mem_cgroup_remove_exceeded(mz, mctz); |
3676 | excess = soft_limit_excess(mz->memcg); | 3688 | excess = soft_limit_excess(mz->memcg); |
3677 | /* | 3689 | /* |
3678 | * One school of thought says that we should not add | 3690 | * One school of thought says that we should not add |
3679 | * back the node to the tree if reclaim returns 0. | 3691 | * back the node to the tree if reclaim returns 0. |
3680 | * But our reclaim could return 0 simply because, due | 3692 | * But our reclaim could return 0 simply because, due |
3681 | * to priority, we are exposing a smaller subset of | 3693 | * to priority, we are exposing a smaller subset of |
3682 | * memory to reclaim from. Consider this as a longer | 3694 | * memory to reclaim from. Consider this as a longer |
3683 | * term TODO. | 3695 | * term TODO. |
3684 | */ | 3696 | */ |
3685 | /* If excess == 0, no tree ops */ | 3697 | /* If excess == 0, no tree ops */ |
3686 | __mem_cgroup_insert_exceeded(mz, mctz, excess); | 3698 | __mem_cgroup_insert_exceeded(mz, mctz, excess); |
3687 | spin_unlock_irq(&mctz->lock); | 3699 | spin_unlock_irq(&mctz->lock); |
3688 | css_put(&mz->memcg->css); | 3700 | css_put(&mz->memcg->css); |
3689 | loop++; | 3701 | loop++; |
3690 | /* | 3702 | /* |
3691 | * Could not reclaim anything and there are no more | 3703 | * Could not reclaim anything and there are no more |
3692 | * mem cgroups to try or we seem to be looping without | 3704 | * mem cgroups to try or we seem to be looping without |
3693 | * reclaiming anything. | 3705 | * reclaiming anything. |
3694 | */ | 3706 | */ |
3695 | if (!nr_reclaimed && | 3707 | if (!nr_reclaimed && |
3696 | (next_mz == NULL || | 3708 | (next_mz == NULL || |
3697 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | 3709 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) |
3698 | break; | 3710 | break; |
3699 | } while (!nr_reclaimed); | 3711 | } while (!nr_reclaimed); |
3700 | if (next_mz) | 3712 | if (next_mz) |
3701 | css_put(&next_mz->memcg->css); | 3713 | css_put(&next_mz->memcg->css); |
3702 | return nr_reclaimed; | 3714 | return nr_reclaimed; |
3703 | } | 3715 | } |
3704 | 3716 | ||
3705 | /** | 3717 | /** |
3706 | * mem_cgroup_force_empty_list - clears LRU of a group | 3718 | * mem_cgroup_force_empty_list - clears LRU of a group |
3707 | * @memcg: group to clear | 3719 | * @memcg: group to clear |
3708 | * @node: NUMA node | 3720 | * @node: NUMA node |
3709 | * @zid: zone id | 3721 | * @zid: zone id |
3710 | * @lru: lru to clear | 3722 | * @lru: lru to clear |
3711 | * | 3723 | * |
3712 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't | 3724 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3713 | * reclaim the pages themselves - pages are moved to the parent (or root) | 3725 | * reclaim the pages themselves - pages are moved to the parent (or root) |
3714 | * group. | 3726 | * group. |
3715 | */ | 3727 | */ |
3716 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3728 | static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3717 | int node, int zid, enum lru_list lru) | 3729 | int node, int zid, enum lru_list lru) |
3718 | { | 3730 | { |
3719 | struct lruvec *lruvec; | 3731 | struct lruvec *lruvec; |
3720 | unsigned long flags; | 3732 | unsigned long flags; |
3721 | struct list_head *list; | 3733 | struct list_head *list; |
3722 | struct page *busy; | 3734 | struct page *busy; |
3723 | struct zone *zone; | 3735 | struct zone *zone; |
3724 | 3736 | ||
3725 | zone = &NODE_DATA(node)->node_zones[zid]; | 3737 | zone = &NODE_DATA(node)->node_zones[zid]; |
3726 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); | 3738 | lruvec = mem_cgroup_zone_lruvec(zone, memcg); |
3727 | list = &lruvec->lists[lru]; | 3739 | list = &lruvec->lists[lru]; |
3728 | 3740 | ||
3729 | busy = NULL; | 3741 | busy = NULL; |
3730 | do { | 3742 | do { |
3731 | struct page_cgroup *pc; | 3743 | struct page_cgroup *pc; |
3732 | struct page *page; | 3744 | struct page *page; |
3733 | 3745 | ||
3734 | spin_lock_irqsave(&zone->lru_lock, flags); | 3746 | spin_lock_irqsave(&zone->lru_lock, flags); |
3735 | if (list_empty(list)) { | 3747 | if (list_empty(list)) { |
3736 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3748 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3737 | break; | 3749 | break; |
3738 | } | 3750 | } |
3739 | page = list_entry(list->prev, struct page, lru); | 3751 | page = list_entry(list->prev, struct page, lru); |
3740 | if (busy == page) { | 3752 | if (busy == page) { |
3741 | list_move(&page->lru, list); | 3753 | list_move(&page->lru, list); |
3742 | busy = NULL; | 3754 | busy = NULL; |
3743 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3755 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3744 | continue; | 3756 | continue; |
3745 | } | 3757 | } |
3746 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3758 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
3747 | 3759 | ||
3748 | pc = lookup_page_cgroup(page); | 3760 | pc = lookup_page_cgroup(page); |
3749 | 3761 | ||
3750 | if (mem_cgroup_move_parent(page, pc, memcg)) { | 3762 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3751 | /* found lock contention or "pc" is obsolete. */ | 3763 | /* found lock contention or "pc" is obsolete. */ |
3752 | busy = page; | 3764 | busy = page; |
3753 | } else | 3765 | } else |
3754 | busy = NULL; | 3766 | busy = NULL; |
3755 | cond_resched(); | 3767 | cond_resched(); |
3756 | } while (!list_empty(list)); | 3768 | } while (!list_empty(list)); |
3757 | } | 3769 | } |
3758 | 3770 | ||
3759 | /* | 3771 | /* |
3760 | * make mem_cgroup's charge to be 0 if there is no task by moving | 3772 | * make mem_cgroup's charge to be 0 if there is no task by moving |
3761 | * all the charges and pages to the parent. | 3773 | * all the charges and pages to the parent. |
3762 | * This enables deleting this mem_cgroup. | 3774 | * This enables deleting this mem_cgroup. |
3763 | * | 3775 | * |
3764 | * Caller is responsible for holding css reference on the memcg. | 3776 | * Caller is responsible for holding css reference on the memcg. |
3765 | */ | 3777 | */ |
3766 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | 3778 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) |
3767 | { | 3779 | { |
3768 | int node, zid; | 3780 | int node, zid; |
3769 | 3781 | ||
3770 | do { | 3782 | do { |
3771 | /* This is for making all *used* pages be on the LRU. */ | 3783 | /* This is for making all *used* pages be on the LRU. */ |
3772 | lru_add_drain_all(); | 3784 | lru_add_drain_all(); |
3773 | drain_all_stock_sync(memcg); | 3785 | drain_all_stock_sync(memcg); |
3774 | mem_cgroup_start_move(memcg); | 3786 | mem_cgroup_start_move(memcg); |
3775 | for_each_node_state(node, N_MEMORY) { | 3787 | for_each_node_state(node, N_MEMORY) { |
3776 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 3788 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
3777 | enum lru_list lru; | 3789 | enum lru_list lru; |
3778 | for_each_lru(lru) { | 3790 | for_each_lru(lru) { |
3779 | mem_cgroup_force_empty_list(memcg, | 3791 | mem_cgroup_force_empty_list(memcg, |
3780 | node, zid, lru); | 3792 | node, zid, lru); |
3781 | } | 3793 | } |
3782 | } | 3794 | } |
3783 | } | 3795 | } |
3784 | mem_cgroup_end_move(memcg); | 3796 | mem_cgroup_end_move(memcg); |
3785 | memcg_oom_recover(memcg); | 3797 | memcg_oom_recover(memcg); |
3786 | cond_resched(); | 3798 | cond_resched(); |
3787 | 3799 | ||
3788 | /* | 3800 | /* |
3789 | * Kernel memory may not necessarily be trackable to a specific | 3801 | * Kernel memory may not necessarily be trackable to a specific |
3790 | * process. So they are not migrated, and therefore we can't | 3802 | * process. So they are not migrated, and therefore we can't |
3791 | * expect their value to drop to 0 here. | 3803 | * expect their value to drop to 0 here. |
3792 | * Having res filled up with kmem only is enough. | 3804 | * Having res filled up with kmem only is enough. |
3793 | * | 3805 | * |
3794 | * This is a safety check because mem_cgroup_force_empty_list | 3806 | * This is a safety check because mem_cgroup_force_empty_list |
3795 | * could have raced with mem_cgroup_replace_page_cache callers | 3807 | * could have raced with mem_cgroup_replace_page_cache callers |
3796 | * so the lru seemed empty but the page could have been added | 3808 | * so the lru seemed empty but the page could have been added |
3797 | * right after the check. RES_USAGE should be safe as we always | 3809 | * right after the check. RES_USAGE should be safe as we always |
3798 | * charge before adding to the LRU. | 3810 | * charge before adding to the LRU. |
3799 | */ | 3811 | */ |
3800 | } while (page_counter_read(&memcg->memory) - | 3812 | } while (page_counter_read(&memcg->memory) - |
3801 | page_counter_read(&memcg->kmem) > 0); | 3813 | page_counter_read(&memcg->kmem) > 0); |
3802 | } | 3814 | } |
3803 | 3815 | ||
3804 | /* | 3816 | /* |
3805 | * Test whether @memcg has children, dead or alive. Note that this | 3817 | * Test whether @memcg has children, dead or alive. Note that this |
3806 | * function doesn't care whether @memcg has use_hierarchy enabled and | 3818 | * function doesn't care whether @memcg has use_hierarchy enabled and |
3807 | * returns %true if there are child csses according to the cgroup | 3819 | * returns %true if there are child csses according to the cgroup |
3808 | * hierarchy. Testing use_hierarchy is the caller's responsibility. | 3820 | * hierarchy. Testing use_hierarchy is the caller's responsibility. |
3809 | */ | 3821 | */ |
3810 | static inline bool memcg_has_children(struct mem_cgroup *memcg) | 3822 | static inline bool memcg_has_children(struct mem_cgroup *memcg) |
3811 | { | 3823 | { |
3812 | bool ret; | 3824 | bool ret; |
3813 | 3825 | ||
3814 | /* | 3826 | /* |
3815 | * The lock does not prevent addition or deletion of children, but | 3827 | * The lock does not prevent addition or deletion of children, but |
3816 | * it prevents a new child from being initialized based on this | 3828 | * it prevents a new child from being initialized based on this |
3817 | * parent in css_online(), so it's enough to decide whether | 3829 | * parent in css_online(), so it's enough to decide whether |
3818 | * hierarchically inherited attributes can still be changed or not. | 3830 | * hierarchically inherited attributes can still be changed or not. |
3819 | */ | 3831 | */ |
3820 | lockdep_assert_held(&memcg_create_mutex); | 3832 | lockdep_assert_held(&memcg_create_mutex); |
3821 | 3833 | ||
3822 | rcu_read_lock(); | 3834 | rcu_read_lock(); |
3823 | ret = css_next_child(NULL, &memcg->css); | 3835 | ret = css_next_child(NULL, &memcg->css); |
3824 | rcu_read_unlock(); | 3836 | rcu_read_unlock(); |
3825 | return ret; | 3837 | return ret; |
3826 | } | 3838 | } |
3827 | 3839 | ||
3828 | /* | 3840 | /* |
3829 | * Reclaims as many pages from the given memcg as possible and moves | 3841 | * Reclaims as many pages from the given memcg as possible and moves |
3830 | * the rest to the parent. | 3842 | * the rest to the parent. |
3831 | * | 3843 | * |
3832 | * Caller is responsible for holding css reference for memcg. | 3844 | * Caller is responsible for holding css reference for memcg. |
3833 | */ | 3845 | */ |
3834 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | 3846 | static int mem_cgroup_force_empty(struct mem_cgroup *memcg) |
3835 | { | 3847 | { |
3836 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 3848 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
3837 | 3849 | ||
3838 | /* we call try-to-free pages to make this cgroup empty */ | 3850 | /* we call try-to-free pages to make this cgroup empty */ |
3839 | lru_add_drain_all(); | 3851 | lru_add_drain_all(); |
3840 | /* try to free all pages in this cgroup */ | 3852 | /* try to free all pages in this cgroup */ |
3841 | while (nr_retries && page_counter_read(&memcg->memory)) { | 3853 | while (nr_retries && page_counter_read(&memcg->memory)) { |
3842 | int progress; | 3854 | int progress; |
3843 | 3855 | ||
3844 | if (signal_pending(current)) | 3856 | if (signal_pending(current)) |
3845 | return -EINTR; | 3857 | return -EINTR; |
3846 | 3858 | ||
3847 | progress = try_to_free_mem_cgroup_pages(memcg, 1, | 3859 | progress = try_to_free_mem_cgroup_pages(memcg, 1, |
3848 | GFP_KERNEL, true); | 3860 | GFP_KERNEL, true); |
3849 | if (!progress) { | 3861 | if (!progress) { |
3850 | nr_retries--; | 3862 | nr_retries--; |
3851 | /* maybe some writeback is necessary */ | 3863 | /* maybe some writeback is necessary */ |
3852 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 3864 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
3853 | } | 3865 | } |
3854 | 3866 | ||
3855 | } | 3867 | } |
3856 | 3868 | ||
3857 | return 0; | 3869 | return 0; |
3858 | } | 3870 | } |
3859 | 3871 | ||
3860 | static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, | 3872 | static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, |
3861 | char *buf, size_t nbytes, | 3873 | char *buf, size_t nbytes, |
3862 | loff_t off) | 3874 | loff_t off) |
3863 | { | 3875 | { |
3864 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 3876 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
3865 | 3877 | ||
3866 | if (mem_cgroup_is_root(memcg)) | 3878 | if (mem_cgroup_is_root(memcg)) |
3867 | return -EINVAL; | 3879 | return -EINVAL; |
3868 | return mem_cgroup_force_empty(memcg) ?: nbytes; | 3880 | return mem_cgroup_force_empty(memcg) ?: nbytes; |
3869 | } | 3881 | } |
3870 | 3882 | ||
3871 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, | 3883 | static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, |
3872 | struct cftype *cft) | 3884 | struct cftype *cft) |
3873 | { | 3885 | { |
3874 | return mem_cgroup_from_css(css)->use_hierarchy; | 3886 | return mem_cgroup_from_css(css)->use_hierarchy; |
3875 | } | 3887 | } |
3876 | 3888 | ||
3877 | static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, | 3889 | static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, |
3878 | struct cftype *cft, u64 val) | 3890 | struct cftype *cft, u64 val) |
3879 | { | 3891 | { |
3880 | int retval = 0; | 3892 | int retval = 0; |
3881 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3893 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
3882 | struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); | 3894 | struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); |
3883 | 3895 | ||
3884 | mutex_lock(&memcg_create_mutex); | 3896 | mutex_lock(&memcg_create_mutex); |
3885 | 3897 | ||
3886 | if (memcg->use_hierarchy == val) | 3898 | if (memcg->use_hierarchy == val) |
3887 | goto out; | 3899 | goto out; |
3888 | 3900 | ||
3889 | /* | 3901 | /* |
3890 | * If parent's use_hierarchy is set, we can't make any modifications | 3902 | * If parent's use_hierarchy is set, we can't make any modifications |
3891 | * in the child subtrees. If it is unset, then the change can | 3903 | * in the child subtrees. If it is unset, then the change can |
3892 | * occur, provided the current cgroup has no children. | 3904 | * occur, provided the current cgroup has no children. |
3893 | * | 3905 | * |
3894 | * For the root cgroup, parent_memcg is NULL; we allow the value to be | 3906 | * For the root cgroup, parent_memcg is NULL; we allow the value to be |
3895 | * set if there are no children. | 3907 | * set if there are no children. |
3896 | */ | 3908 | */ |
3897 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && | 3909 | if ((!parent_memcg || !parent_memcg->use_hierarchy) && |
3898 | (val == 1 || val == 0)) { | 3910 | (val == 1 || val == 0)) { |
3899 | if (!memcg_has_children(memcg)) | 3911 | if (!memcg_has_children(memcg)) |
3900 | memcg->use_hierarchy = val; | 3912 | memcg->use_hierarchy = val; |
3901 | else | 3913 | else |
3902 | retval = -EBUSY; | 3914 | retval = -EBUSY; |
3903 | } else | 3915 | } else |
3904 | retval = -EINVAL; | 3916 | retval = -EINVAL; |
3905 | 3917 | ||
3906 | out: | 3918 | out: |
3907 | mutex_unlock(&memcg_create_mutex); | 3919 | mutex_unlock(&memcg_create_mutex); |
3908 | 3920 | ||
3909 | return retval; | 3921 | return retval; |
3910 | } | 3922 | } |
3911 | 3923 | ||
3912 | static unsigned long tree_stat(struct mem_cgroup *memcg, | 3924 | static unsigned long tree_stat(struct mem_cgroup *memcg, |
3913 | enum mem_cgroup_stat_index idx) | 3925 | enum mem_cgroup_stat_index idx) |
3914 | { | 3926 | { |
3915 | struct mem_cgroup *iter; | 3927 | struct mem_cgroup *iter; |
3916 | long val = 0; | 3928 | long val = 0; |
3917 | 3929 | ||
3918 | /* Per-cpu values can be negative, use a signed accumulator */ | 3930 | /* Per-cpu values can be negative, use a signed accumulator */ |
3919 | for_each_mem_cgroup_tree(iter, memcg) | 3931 | for_each_mem_cgroup_tree(iter, memcg) |
3920 | val += mem_cgroup_read_stat(iter, idx); | 3932 | val += mem_cgroup_read_stat(iter, idx); |
3921 | 3933 | ||
3922 | if (val < 0) /* race ? */ | 3934 | if (val < 0) /* race ? */ |
3923 | val = 0; | 3935 | val = 0; |
3924 | return val; | 3936 | return val; |
3925 | } | 3937 | } |
3926 | 3938 | ||
3927 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | 3939 | static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) |
3928 | { | 3940 | { |
3929 | u64 val; | 3941 | u64 val; |
3930 | 3942 | ||
3931 | if (mem_cgroup_is_root(memcg)) { | 3943 | if (mem_cgroup_is_root(memcg)) { |
3932 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); | 3944 | val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); |
3933 | val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); | 3945 | val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); |
3934 | if (swap) | 3946 | if (swap) |
3935 | val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); | 3947 | val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3936 | } else { | 3948 | } else { |
3937 | if (!swap) | 3949 | if (!swap) |
3938 | val = page_counter_read(&memcg->memory); | 3950 | val = page_counter_read(&memcg->memory); |
3939 | else | 3951 | else |
3940 | val = page_counter_read(&memcg->memsw); | 3952 | val = page_counter_read(&memcg->memsw); |
3941 | } | 3953 | } |
3942 | return val << PAGE_SHIFT; | 3954 | return val << PAGE_SHIFT; |
3943 | } | 3955 | } |
3944 | 3956 | ||
3945 | enum { | 3957 | enum { |
3946 | RES_USAGE, | 3958 | RES_USAGE, |
3947 | RES_LIMIT, | 3959 | RES_LIMIT, |
3948 | RES_MAX_USAGE, | 3960 | RES_MAX_USAGE, |
3949 | RES_FAILCNT, | 3961 | RES_FAILCNT, |
3950 | RES_SOFT_LIMIT, | 3962 | RES_SOFT_LIMIT, |
3951 | }; | 3963 | }; |
3952 | 3964 | ||
3953 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, | 3965 | static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, |
3954 | struct cftype *cft) | 3966 | struct cftype *cft) |
3955 | { | 3967 | { |
3956 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 3968 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
3957 | struct page_counter *counter; | 3969 | struct page_counter *counter; |
3958 | 3970 | ||
3959 | switch (MEMFILE_TYPE(cft->private)) { | 3971 | switch (MEMFILE_TYPE(cft->private)) { |
3960 | case _MEM: | 3972 | case _MEM: |
3961 | counter = &memcg->memory; | 3973 | counter = &memcg->memory; |
3962 | break; | 3974 | break; |
3963 | case _MEMSWAP: | 3975 | case _MEMSWAP: |
3964 | counter = &memcg->memsw; | 3976 | counter = &memcg->memsw; |
3965 | break; | 3977 | break; |
3966 | case _KMEM: | 3978 | case _KMEM: |
3967 | counter = &memcg->kmem; | 3979 | counter = &memcg->kmem; |
3968 | break; | 3980 | break; |
3969 | default: | 3981 | default: |
3970 | BUG(); | 3982 | BUG(); |
3971 | } | 3983 | } |
3972 | 3984 | ||
3973 | switch (MEMFILE_ATTR(cft->private)) { | 3985 | switch (MEMFILE_ATTR(cft->private)) { |
3974 | case RES_USAGE: | 3986 | case RES_USAGE: |
3975 | if (counter == &memcg->memory) | 3987 | if (counter == &memcg->memory) |
3976 | return mem_cgroup_usage(memcg, false); | 3988 | return mem_cgroup_usage(memcg, false); |
3977 | if (counter == &memcg->memsw) | 3989 | if (counter == &memcg->memsw) |
3978 | return mem_cgroup_usage(memcg, true); | 3990 | return mem_cgroup_usage(memcg, true); |
3979 | return (u64)page_counter_read(counter) * PAGE_SIZE; | 3991 | return (u64)page_counter_read(counter) * PAGE_SIZE; |
3980 | case RES_LIMIT: | 3992 | case RES_LIMIT: |
3981 | return (u64)counter->limit * PAGE_SIZE; | 3993 | return (u64)counter->limit * PAGE_SIZE; |
3982 | case RES_MAX_USAGE: | 3994 | case RES_MAX_USAGE: |
3983 | return (u64)counter->watermark * PAGE_SIZE; | 3995 | return (u64)counter->watermark * PAGE_SIZE; |
3984 | case RES_FAILCNT: | 3996 | case RES_FAILCNT: |
3985 | return counter->failcnt; | 3997 | return counter->failcnt; |
3986 | case RES_SOFT_LIMIT: | 3998 | case RES_SOFT_LIMIT: |
3987 | return (u64)memcg->soft_limit * PAGE_SIZE; | 3999 | return (u64)memcg->soft_limit * PAGE_SIZE; |
3988 | default: | 4000 | default: |
3989 | BUG(); | 4001 | BUG(); |
3990 | } | 4002 | } |
3991 | } | 4003 | } |
3992 | 4004 | ||
3993 | #ifdef CONFIG_MEMCG_KMEM | 4005 | #ifdef CONFIG_MEMCG_KMEM |
3994 | /* should be called with activate_kmem_mutex held */ | 4006 | /* should be called with activate_kmem_mutex held */ |
3995 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, | 4007 | static int __memcg_activate_kmem(struct mem_cgroup *memcg, |
3996 | unsigned long nr_pages) | 4008 | unsigned long nr_pages) |
3997 | { | 4009 | { |
3998 | int err = 0; | 4010 | int err = 0; |
3999 | int memcg_id; | 4011 | int memcg_id; |
4000 | 4012 | ||
4001 | if (memcg_kmem_is_active(memcg)) | 4013 | if (memcg_kmem_is_active(memcg)) |
4002 | return 0; | 4014 | return 0; |
4003 | 4015 | ||
4004 | /* | 4016 | /* |
4005 | * We are going to allocate memory for data shared by all memory | 4017 | * We are going to allocate memory for data shared by all memory |
4006 | * cgroups so let's stop accounting here. | 4018 | * cgroups so let's stop accounting here. |
4007 | */ | 4019 | */ |
4008 | memcg_stop_kmem_account(); | 4020 | memcg_stop_kmem_account(); |
4009 | 4021 | ||
4010 | /* | 4022 | /* |
4011 | * For simplicity, we won't allow this to be disabled. It also can't | 4023 | * For simplicity, we won't allow this to be disabled. It also can't |
4012 | * be changed if the cgroup has children already, or if tasks had | 4024 | * be changed if the cgroup has children already, or if tasks had |
4013 | * already joined. | 4025 | * already joined. |
4014 | * | 4026 | * |
4015 | * If tasks join before we set the limit, a person looking at | 4027 | * If tasks join before we set the limit, a person looking at |
4016 | * kmem.usage_in_bytes will have no way to determine when it took | 4028 | * kmem.usage_in_bytes will have no way to determine when it took |
4017 | * place, which makes the value quite meaningless. | 4029 | * place, which makes the value quite meaningless. |
4018 | * | 4030 | * |
4019 | * After it first became limited, changes in the value of the limit are | 4031 | * After it first became limited, changes in the value of the limit are |
4020 | * of course permitted. | 4032 | * of course permitted. |
4021 | */ | 4033 | */ |
4022 | mutex_lock(&memcg_create_mutex); | 4034 | mutex_lock(&memcg_create_mutex); |
4023 | if (cgroup_has_tasks(memcg->css.cgroup) || | 4035 | if (cgroup_has_tasks(memcg->css.cgroup) || |
4024 | (memcg->use_hierarchy && memcg_has_children(memcg))) | 4036 | (memcg->use_hierarchy && memcg_has_children(memcg))) |
4025 | err = -EBUSY; | 4037 | err = -EBUSY; |
4026 | mutex_unlock(&memcg_create_mutex); | 4038 | mutex_unlock(&memcg_create_mutex); |
4027 | if (err) | 4039 | if (err) |
4028 | goto out; | 4040 | goto out; |
4029 | 4041 | ||
4030 | memcg_id = memcg_alloc_cache_id(); | 4042 | memcg_id = memcg_alloc_cache_id(); |
4031 | if (memcg_id < 0) { | 4043 | if (memcg_id < 0) { |
4032 | err = memcg_id; | 4044 | err = memcg_id; |
4033 | goto out; | 4045 | goto out; |
4034 | } | 4046 | } |
4035 | 4047 | ||
4036 | memcg->kmemcg_id = memcg_id; | 4048 | memcg->kmemcg_id = memcg_id; |
4037 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4049 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
4038 | 4050 | ||
4039 | /* | 4051 | /* |
4040 | * We couldn't have accounted to this cgroup, because it hasn't got the | 4052 | * We couldn't have accounted to this cgroup, because it hasn't got the |
4041 | * active bit set yet, so this should succeed. | 4053 | * active bit set yet, so this should succeed. |
4042 | */ | 4054 | */ |
4043 | err = page_counter_limit(&memcg->kmem, nr_pages); | 4055 | err = page_counter_limit(&memcg->kmem, nr_pages); |
4044 | VM_BUG_ON(err); | 4056 | VM_BUG_ON(err); |
4045 | 4057 | ||
4046 | static_key_slow_inc(&memcg_kmem_enabled_key); | 4058 | static_key_slow_inc(&memcg_kmem_enabled_key); |
4047 | /* | 4059 | /* |
4048 | * Setting the active bit after enabling static branching will | 4060 | * Setting the active bit after enabling static branching will |
4049 | * guarantee no one starts accounting before all call sites are | 4061 | * guarantee no one starts accounting before all call sites are |
4050 | * patched. | 4062 | * patched. |
4051 | */ | 4063 | */ |
4052 | memcg_kmem_set_active(memcg); | 4064 | memcg_kmem_set_active(memcg); |
4053 | out: | 4065 | out: |
4054 | memcg_resume_kmem_account(); | 4066 | memcg_resume_kmem_account(); |
4055 | return err; | 4067 | return err; |
4056 | } | 4068 | } |
4057 | 4069 | ||
4058 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | 4070 | static int memcg_activate_kmem(struct mem_cgroup *memcg, |
4059 | unsigned long nr_pages) | 4071 | unsigned long nr_pages) |
4060 | { | 4072 | { |
4061 | int ret; | 4073 | int ret; |
4062 | 4074 | ||
4063 | mutex_lock(&activate_kmem_mutex); | 4075 | mutex_lock(&activate_kmem_mutex); |
4064 | ret = __memcg_activate_kmem(memcg, nr_pages); | 4076 | ret = __memcg_activate_kmem(memcg, nr_pages); |
4065 | mutex_unlock(&activate_kmem_mutex); | 4077 | mutex_unlock(&activate_kmem_mutex); |
4066 | return ret; | 4078 | return ret; |
4067 | } | 4079 | } |
4068 | 4080 | ||
4069 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 4081 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4070 | unsigned long limit) | 4082 | unsigned long limit) |
4071 | { | 4083 | { |
4072 | int ret; | 4084 | int ret; |
4073 | 4085 | ||
4074 | mutex_lock(&memcg_limit_mutex); | 4086 | mutex_lock(&memcg_limit_mutex); |
4075 | if (!memcg_kmem_is_active(memcg)) | 4087 | if (!memcg_kmem_is_active(memcg)) |
4076 | ret = memcg_activate_kmem(memcg, limit); | 4088 | ret = memcg_activate_kmem(memcg, limit); |
4077 | else | 4089 | else |
4078 | ret = page_counter_limit(&memcg->kmem, limit); | 4090 | ret = page_counter_limit(&memcg->kmem, limit); |
4079 | mutex_unlock(&memcg_limit_mutex); | 4091 | mutex_unlock(&memcg_limit_mutex); |
4080 | return ret; | 4092 | return ret; |
4081 | } | 4093 | } |
4082 | 4094 | ||
4083 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | 4095 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) |
4084 | { | 4096 | { |
4085 | int ret = 0; | 4097 | int ret = 0; |
4086 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 4098 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
4087 | 4099 | ||
4088 | if (!parent) | 4100 | if (!parent) |
4089 | return 0; | 4101 | return 0; |
4090 | 4102 | ||
4091 | mutex_lock(&activate_kmem_mutex); | 4103 | mutex_lock(&activate_kmem_mutex); |
4092 | /* | 4104 | /* |
4093 | * If the parent cgroup is not kmem-active now, it cannot be activated | 4105 | * If the parent cgroup is not kmem-active now, it cannot be activated |
4094 | * after this point, because it has at least one child already. | 4106 | * after this point, because it has at least one child already. |
4095 | */ | 4107 | */ |
4096 | if (memcg_kmem_is_active(parent)) | 4108 | if (memcg_kmem_is_active(parent)) |
4097 | ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); | 4109 | ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX); |
4098 | mutex_unlock(&activate_kmem_mutex); | 4110 | mutex_unlock(&activate_kmem_mutex); |
4099 | return ret; | 4111 | return ret; |
4100 | } | 4112 | } |
4101 | #else | 4113 | #else |
4102 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, | 4114 | static int memcg_update_kmem_limit(struct mem_cgroup *memcg, |
4103 | unsigned long limit) | 4115 | unsigned long limit) |
4104 | { | 4116 | { |
4105 | return -EINVAL; | 4117 | return -EINVAL; |
4106 | } | 4118 | } |
4107 | #endif /* CONFIG_MEMCG_KMEM */ | 4119 | #endif /* CONFIG_MEMCG_KMEM */ |
4108 | 4120 | ||
4109 | /* | 4121 | /* |
4110 | * The user of this function is... | 4122 | * The user of this function is... |
4111 | * RES_LIMIT. | 4123 | * RES_LIMIT. |
4112 | */ | 4124 | */ |
4113 | static ssize_t mem_cgroup_write(struct kernfs_open_file *of, | 4125 | static ssize_t mem_cgroup_write(struct kernfs_open_file *of, |
4114 | char *buf, size_t nbytes, loff_t off) | 4126 | char *buf, size_t nbytes, loff_t off) |
4115 | { | 4127 | { |
4116 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 4128 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4117 | unsigned long nr_pages; | 4129 | unsigned long nr_pages; |
4118 | int ret; | 4130 | int ret; |
4119 | 4131 | ||
4120 | buf = strstrip(buf); | 4132 | buf = strstrip(buf); |
4121 | ret = page_counter_memparse(buf, &nr_pages); | 4133 | ret = page_counter_memparse(buf, &nr_pages); |
4122 | if (ret) | 4134 | if (ret) |
4123 | return ret; | 4135 | return ret; |
4124 | 4136 | ||
4125 | switch (MEMFILE_ATTR(of_cft(of)->private)) { | 4137 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4126 | case RES_LIMIT: | 4138 | case RES_LIMIT: |
4127 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | 4139 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ |
4128 | ret = -EINVAL; | 4140 | ret = -EINVAL; |
4129 | break; | 4141 | break; |
4130 | } | 4142 | } |
4131 | switch (MEMFILE_TYPE(of_cft(of)->private)) { | 4143 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4132 | case _MEM: | 4144 | case _MEM: |
4133 | ret = mem_cgroup_resize_limit(memcg, nr_pages); | 4145 | ret = mem_cgroup_resize_limit(memcg, nr_pages); |
4134 | break; | 4146 | break; |
4135 | case _MEMSWAP: | 4147 | case _MEMSWAP: |
4136 | ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); | 4148 | ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); |
4137 | break; | 4149 | break; |
4138 | case _KMEM: | 4150 | case _KMEM: |
4139 | ret = memcg_update_kmem_limit(memcg, nr_pages); | 4151 | ret = memcg_update_kmem_limit(memcg, nr_pages); |
4140 | break; | 4152 | break; |
4141 | } | 4153 | } |
4142 | break; | 4154 | break; |
4143 | case RES_SOFT_LIMIT: | 4155 | case RES_SOFT_LIMIT: |
4144 | memcg->soft_limit = nr_pages; | 4156 | memcg->soft_limit = nr_pages; |
4145 | ret = 0; | 4157 | ret = 0; |
4146 | break; | 4158 | break; |
4147 | } | 4159 | } |
4148 | return ret ?: nbytes; | 4160 | return ret ?: nbytes; |
4149 | } | 4161 | } |
4150 | 4162 | ||
4151 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, | 4163 | static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, |
4152 | size_t nbytes, loff_t off) | 4164 | size_t nbytes, loff_t off) |
4153 | { | 4165 | { |
4154 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); | 4166 | struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); |
4155 | struct page_counter *counter; | 4167 | struct page_counter *counter; |
4156 | 4168 | ||
4157 | switch (MEMFILE_TYPE(of_cft(of)->private)) { | 4169 | switch (MEMFILE_TYPE(of_cft(of)->private)) { |
4158 | case _MEM: | 4170 | case _MEM: |
4159 | counter = &memcg->memory; | 4171 | counter = &memcg->memory; |
4160 | break; | 4172 | break; |
4161 | case _MEMSWAP: | 4173 | case _MEMSWAP: |
4162 | counter = &memcg->memsw; | 4174 | counter = &memcg->memsw; |
4163 | break; | 4175 | break; |
4164 | case _KMEM: | 4176 | case _KMEM: |
4165 | counter = &memcg->kmem; | 4177 | counter = &memcg->kmem; |
4166 | break; | 4178 | break; |
4167 | default: | 4179 | default: |
4168 | BUG(); | 4180 | BUG(); |
4169 | } | 4181 | } |
4170 | 4182 | ||
4171 | switch (MEMFILE_ATTR(of_cft(of)->private)) { | 4183 | switch (MEMFILE_ATTR(of_cft(of)->private)) { |
4172 | case RES_MAX_USAGE: | 4184 | case RES_MAX_USAGE: |
4173 | page_counter_reset_watermark(counter); | 4185 | page_counter_reset_watermark(counter); |
4174 | break; | 4186 | break; |
4175 | case RES_FAILCNT: | 4187 | case RES_FAILCNT: |
4176 | counter->failcnt = 0; | 4188 | counter->failcnt = 0; |
4177 | break; | 4189 | break; |
4178 | default: | 4190 | default: |
4179 | BUG(); | 4191 | BUG(); |
4180 | } | 4192 | } |
4181 | 4193 | ||
4182 | return nbytes; | 4194 | return nbytes; |
4183 | } | 4195 | } |
4184 | 4196 | ||
4185 | static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, | 4197 | static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, |
4186 | struct cftype *cft) | 4198 | struct cftype *cft) |
4187 | { | 4199 | { |
4188 | return mem_cgroup_from_css(css)->move_charge_at_immigrate; | 4200 | return mem_cgroup_from_css(css)->move_charge_at_immigrate; |
4189 | } | 4201 | } |
4190 | 4202 | ||
4191 | #ifdef CONFIG_MMU | 4203 | #ifdef CONFIG_MMU |
4192 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | 4204 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, |
4193 | struct cftype *cft, u64 val) | 4205 | struct cftype *cft, u64 val) |
4194 | { | 4206 | { |
4195 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4207 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4196 | 4208 | ||
4197 | if (val >= (1 << NR_MOVE_TYPE)) | 4209 | if (val >= (1 << NR_MOVE_TYPE)) |
4198 | return -EINVAL; | 4210 | return -EINVAL; |
4199 | 4211 | ||
4200 | /* | 4212 | /* |
4201 | * No kind of locking is needed in here, because ->can_attach() will | 4213 | * No kind of locking is needed in here, because ->can_attach() will |
4202 | * check this value once in the beginning of the process, and then carry | 4214 | * check this value once in the beginning of the process, and then carry |
4203 | * on with stale data. This means that changes to this value will only | 4215 | * on with stale data. This means that changes to this value will only |
4204 | * affect task migrations starting after the change. | 4216 | * affect task migrations starting after the change. |
4205 | */ | 4217 | */ |
4206 | memcg->move_charge_at_immigrate = val; | 4218 | memcg->move_charge_at_immigrate = val; |
4207 | return 0; | 4219 | return 0; |
4208 | } | 4220 | } |
4209 | #else | 4221 | #else |
4210 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, | 4222 | static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, |
4211 | struct cftype *cft, u64 val) | 4223 | struct cftype *cft, u64 val) |
4212 | { | 4224 | { |
4213 | return -ENOSYS; | 4225 | return -ENOSYS; |
4214 | } | 4226 | } |
4215 | #endif | 4227 | #endif |
4216 | 4228 | ||
4217 | #ifdef CONFIG_NUMA | 4229 | #ifdef CONFIG_NUMA |
4218 | static int memcg_numa_stat_show(struct seq_file *m, void *v) | 4230 | static int memcg_numa_stat_show(struct seq_file *m, void *v) |
4219 | { | 4231 | { |
4220 | struct numa_stat { | 4232 | struct numa_stat { |
4221 | const char *name; | 4233 | const char *name; |
4222 | unsigned int lru_mask; | 4234 | unsigned int lru_mask; |
4223 | }; | 4235 | }; |
4224 | 4236 | ||
4225 | static const struct numa_stat stats[] = { | 4237 | static const struct numa_stat stats[] = { |
4226 | { "total", LRU_ALL }, | 4238 | { "total", LRU_ALL }, |
4227 | { "file", LRU_ALL_FILE }, | 4239 | { "file", LRU_ALL_FILE }, |
4228 | { "anon", LRU_ALL_ANON }, | 4240 | { "anon", LRU_ALL_ANON }, |
4229 | { "unevictable", BIT(LRU_UNEVICTABLE) }, | 4241 | { "unevictable", BIT(LRU_UNEVICTABLE) }, |
4230 | }; | 4242 | }; |
4231 | const struct numa_stat *stat; | 4243 | const struct numa_stat *stat; |
4232 | int nid; | 4244 | int nid; |
4233 | unsigned long nr; | 4245 | unsigned long nr; |
4234 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 4246 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
4235 | 4247 | ||
4236 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 4248 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
4237 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); | 4249 | nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); |
4238 | seq_printf(m, "%s=%lu", stat->name, nr); | 4250 | seq_printf(m, "%s=%lu", stat->name, nr); |
4239 | for_each_node_state(nid, N_MEMORY) { | 4251 | for_each_node_state(nid, N_MEMORY) { |
4240 | nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4252 | nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
4241 | stat->lru_mask); | 4253 | stat->lru_mask); |
4242 | seq_printf(m, " N%d=%lu", nid, nr); | 4254 | seq_printf(m, " N%d=%lu", nid, nr); |
4243 | } | 4255 | } |
4244 | seq_putc(m, '\n'); | 4256 | seq_putc(m, '\n'); |
4245 | } | 4257 | } |
4246 | 4258 | ||
4247 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { | 4259 | for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { |
4248 | struct mem_cgroup *iter; | 4260 | struct mem_cgroup *iter; |
4249 | 4261 | ||
4250 | nr = 0; | 4262 | nr = 0; |
4251 | for_each_mem_cgroup_tree(iter, memcg) | 4263 | for_each_mem_cgroup_tree(iter, memcg) |
4252 | nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); | 4264 | nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); |
4253 | seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); | 4265 | seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); |
4254 | for_each_node_state(nid, N_MEMORY) { | 4266 | for_each_node_state(nid, N_MEMORY) { |
4255 | nr = 0; | 4267 | nr = 0; |
4256 | for_each_mem_cgroup_tree(iter, memcg) | 4268 | for_each_mem_cgroup_tree(iter, memcg) |
4257 | nr += mem_cgroup_node_nr_lru_pages( | 4269 | nr += mem_cgroup_node_nr_lru_pages( |
4258 | iter, nid, stat->lru_mask); | 4270 | iter, nid, stat->lru_mask); |
4259 | seq_printf(m, " N%d=%lu", nid, nr); | 4271 | seq_printf(m, " N%d=%lu", nid, nr); |
4260 | } | 4272 | } |
4261 | seq_putc(m, '\n'); | 4273 | seq_putc(m, '\n'); |
4262 | } | 4274 | } |
4263 | 4275 | ||
4264 | return 0; | 4276 | return 0; |
4265 | } | 4277 | } |
4266 | #endif /* CONFIG_NUMA */ | 4278 | #endif /* CONFIG_NUMA */ |
4267 | 4279 | ||
4268 | static inline void mem_cgroup_lru_names_not_uptodate(void) | 4280 | static inline void mem_cgroup_lru_names_not_uptodate(void) |
4269 | { | 4281 | { |
4270 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 4282 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
4271 | } | 4283 | } |
4272 | 4284 | ||
4273 | static int memcg_stat_show(struct seq_file *m, void *v) | 4285 | static int memcg_stat_show(struct seq_file *m, void *v) |
4274 | { | 4286 | { |
4275 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 4287 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
4276 | unsigned long memory, memsw; | 4288 | unsigned long memory, memsw; |
4277 | struct mem_cgroup *mi; | 4289 | struct mem_cgroup *mi; |
4278 | unsigned int i; | 4290 | unsigned int i; |
4279 | 4291 | ||
4280 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4292 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4281 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 4293 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4282 | continue; | 4294 | continue; |
4283 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], | 4295 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4284 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 4296 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); |
4285 | } | 4297 | } |
4286 | 4298 | ||
4287 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) | 4299 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) |
4288 | seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], | 4300 | seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], |
4289 | mem_cgroup_read_events(memcg, i)); | 4301 | mem_cgroup_read_events(memcg, i)); |
4290 | 4302 | ||
4291 | for (i = 0; i < NR_LRU_LISTS; i++) | 4303 | for (i = 0; i < NR_LRU_LISTS; i++) |
4292 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], | 4304 | seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], |
4293 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); | 4305 | mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); |
4294 | 4306 | ||
4295 | /* Hierarchical information */ | 4307 | /* Hierarchical information */ |
4296 | memory = memsw = PAGE_COUNTER_MAX; | 4308 | memory = memsw = PAGE_COUNTER_MAX; |
4297 | for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { | 4309 | for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { |
4298 | memory = min(memory, mi->memory.limit); | 4310 | memory = min(memory, mi->memory.limit); |
4299 | memsw = min(memsw, mi->memsw.limit); | 4311 | memsw = min(memsw, mi->memsw.limit); |
4300 | } | 4312 | } |
4301 | seq_printf(m, "hierarchical_memory_limit %llu\n", | 4313 | seq_printf(m, "hierarchical_memory_limit %llu\n", |
4302 | (u64)memory * PAGE_SIZE); | 4314 | (u64)memory * PAGE_SIZE); |
4303 | if (do_swap_account) | 4315 | if (do_swap_account) |
4304 | seq_printf(m, "hierarchical_memsw_limit %llu\n", | 4316 | seq_printf(m, "hierarchical_memsw_limit %llu\n", |
4305 | (u64)memsw * PAGE_SIZE); | 4317 | (u64)memsw * PAGE_SIZE); |
4306 | 4318 | ||
4307 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4319 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4308 | long long val = 0; | 4320 | long long val = 0; |
4309 | 4321 | ||
4310 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) | 4322 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4311 | continue; | 4323 | continue; |
4312 | for_each_mem_cgroup_tree(mi, memcg) | 4324 | for_each_mem_cgroup_tree(mi, memcg) |
4313 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 4325 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; |
4314 | seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); | 4326 | seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); |
4315 | } | 4327 | } |
4316 | 4328 | ||
4317 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { | 4329 | for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { |
4318 | unsigned long long val = 0; | 4330 | unsigned long long val = 0; |
4319 | 4331 | ||
4320 | for_each_mem_cgroup_tree(mi, memcg) | 4332 | for_each_mem_cgroup_tree(mi, memcg) |
4321 | val += mem_cgroup_read_events(mi, i); | 4333 | val += mem_cgroup_read_events(mi, i); |
4322 | seq_printf(m, "total_%s %llu\n", | 4334 | seq_printf(m, "total_%s %llu\n", |
4323 | mem_cgroup_events_names[i], val); | 4335 | mem_cgroup_events_names[i], val); |
4324 | } | 4336 | } |
4325 | 4337 | ||
4326 | for (i = 0; i < NR_LRU_LISTS; i++) { | 4338 | for (i = 0; i < NR_LRU_LISTS; i++) { |
4327 | unsigned long long val = 0; | 4339 | unsigned long long val = 0; |
4328 | 4340 | ||
4329 | for_each_mem_cgroup_tree(mi, memcg) | 4341 | for_each_mem_cgroup_tree(mi, memcg) |
4330 | val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; | 4342 | val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; |
4331 | seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); | 4343 | seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); |
4332 | } | 4344 | } |
4333 | 4345 | ||
4334 | #ifdef CONFIG_DEBUG_VM | 4346 | #ifdef CONFIG_DEBUG_VM |
4335 | { | 4347 | { |
4336 | int nid, zid; | 4348 | int nid, zid; |
4337 | struct mem_cgroup_per_zone *mz; | 4349 | struct mem_cgroup_per_zone *mz; |
4338 | struct zone_reclaim_stat *rstat; | 4350 | struct zone_reclaim_stat *rstat; |
4339 | unsigned long recent_rotated[2] = {0, 0}; | 4351 | unsigned long recent_rotated[2] = {0, 0}; |
4340 | unsigned long recent_scanned[2] = {0, 0}; | 4352 | unsigned long recent_scanned[2] = {0, 0}; |
4341 | 4353 | ||
4342 | for_each_online_node(nid) | 4354 | for_each_online_node(nid) |
4343 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 4355 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
4344 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; | 4356 | mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; |
4345 | rstat = &mz->lruvec.reclaim_stat; | 4357 | rstat = &mz->lruvec.reclaim_stat; |
4346 | 4358 | ||
4347 | recent_rotated[0] += rstat->recent_rotated[0]; | 4359 | recent_rotated[0] += rstat->recent_rotated[0]; |
4348 | recent_rotated[1] += rstat->recent_rotated[1]; | 4360 | recent_rotated[1] += rstat->recent_rotated[1]; |
4349 | recent_scanned[0] += rstat->recent_scanned[0]; | 4361 | recent_scanned[0] += rstat->recent_scanned[0]; |
4350 | recent_scanned[1] += rstat->recent_scanned[1]; | 4362 | recent_scanned[1] += rstat->recent_scanned[1]; |
4351 | } | 4363 | } |
4352 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); | 4364 | seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); |
4353 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); | 4365 | seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); |
4354 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); | 4366 | seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); |
4355 | seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); | 4367 | seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); |
4356 | } | 4368 | } |
4357 | #endif | 4369 | #endif |
4358 | 4370 | ||
4359 | return 0; | 4371 | return 0; |
4360 | } | 4372 | } |
4361 | 4373 | ||
4362 | static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, | 4374 | static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, |
4363 | struct cftype *cft) | 4375 | struct cftype *cft) |
4364 | { | 4376 | { |
4365 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4377 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4366 | 4378 | ||
4367 | return mem_cgroup_swappiness(memcg); | 4379 | return mem_cgroup_swappiness(memcg); |
4368 | } | 4380 | } |
4369 | 4381 | ||
4370 | static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, | 4382 | static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, |
4371 | struct cftype *cft, u64 val) | 4383 | struct cftype *cft, u64 val) |
4372 | { | 4384 | { |
4373 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4385 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4374 | 4386 | ||
4375 | if (val > 100) | 4387 | if (val > 100) |
4376 | return -EINVAL; | 4388 | return -EINVAL; |
4377 | 4389 | ||
4378 | if (css->parent) | 4390 | if (css->parent) |
4379 | memcg->swappiness = val; | 4391 | memcg->swappiness = val; |
4380 | else | 4392 | else |
4381 | vm_swappiness = val; | 4393 | vm_swappiness = val; |
4382 | 4394 | ||
4383 | return 0; | 4395 | return 0; |
4384 | } | 4396 | } |
4385 | 4397 | ||
4386 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | 4398 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) |
4387 | { | 4399 | { |
4388 | struct mem_cgroup_threshold_ary *t; | 4400 | struct mem_cgroup_threshold_ary *t; |
4389 | unsigned long usage; | 4401 | unsigned long usage; |
4390 | int i; | 4402 | int i; |
4391 | 4403 | ||
4392 | rcu_read_lock(); | 4404 | rcu_read_lock(); |
4393 | if (!swap) | 4405 | if (!swap) |
4394 | t = rcu_dereference(memcg->thresholds.primary); | 4406 | t = rcu_dereference(memcg->thresholds.primary); |
4395 | else | 4407 | else |
4396 | t = rcu_dereference(memcg->memsw_thresholds.primary); | 4408 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
4397 | 4409 | ||
4398 | if (!t) | 4410 | if (!t) |
4399 | goto unlock; | 4411 | goto unlock; |
4400 | 4412 | ||
4401 | usage = mem_cgroup_usage(memcg, swap); | 4413 | usage = mem_cgroup_usage(memcg, swap); |
4402 | 4414 | ||
4403 | /* | 4415 | /* |
4404 | * current_threshold points to threshold just below or equal to usage. | 4416 | * current_threshold points to threshold just below or equal to usage. |
4405 | * If it's not true, a threshold was crossed after last | 4417 | * If it's not true, a threshold was crossed after last |
4406 | * call of __mem_cgroup_threshold(). | 4418 | * call of __mem_cgroup_threshold(). |
4407 | */ | 4419 | */ |
4408 | i = t->current_threshold; | 4420 | i = t->current_threshold; |
4409 | 4421 | ||
4410 | /* | 4422 | /* |
4411 | * Iterate backward over array of thresholds starting from | 4423 | * Iterate backward over array of thresholds starting from |
4412 | * current_threshold and check if a threshold is crossed. | 4424 | * current_threshold and check if a threshold is crossed. |
4413 | * If none of thresholds below usage is crossed, we read | 4425 | * If none of thresholds below usage is crossed, we read |
4414 | * only one element of the array here. | 4426 | * only one element of the array here. |
4415 | */ | 4427 | */ |
4416 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | 4428 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) |
4417 | eventfd_signal(t->entries[i].eventfd, 1); | 4429 | eventfd_signal(t->entries[i].eventfd, 1); |
4418 | 4430 | ||
4419 | /* i = current_threshold + 1 */ | 4431 | /* i = current_threshold + 1 */ |
4420 | i++; | 4432 | i++; |
4421 | 4433 | ||
4422 | /* | 4434 | /* |
4423 | * Iterate forward over array of thresholds starting from | 4435 | * Iterate forward over array of thresholds starting from |
4424 | * current_threshold+1 and check if a threshold is crossed. | 4436 | * current_threshold+1 and check if a threshold is crossed. |
4425 | * If none of thresholds above usage is crossed, we read | 4437 | * If none of thresholds above usage is crossed, we read |
4426 | * only one element of the array here. | 4438 | * only one element of the array here. |
4427 | */ | 4439 | */ |
4428 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | 4440 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) |
4429 | eventfd_signal(t->entries[i].eventfd, 1); | 4441 | eventfd_signal(t->entries[i].eventfd, 1); |
4430 | 4442 | ||
4431 | /* Update current_threshold */ | 4443 | /* Update current_threshold */ |
4432 | t->current_threshold = i - 1; | 4444 | t->current_threshold = i - 1; |
4433 | unlock: | 4445 | unlock: |
4434 | rcu_read_unlock(); | 4446 | rcu_read_unlock(); |
4435 | } | 4447 | } |
4436 | 4448 | ||
4437 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | 4449 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) |
4438 | { | 4450 | { |
4439 | while (memcg) { | 4451 | while (memcg) { |
4440 | __mem_cgroup_threshold(memcg, false); | 4452 | __mem_cgroup_threshold(memcg, false); |
4441 | if (do_swap_account) | 4453 | if (do_swap_account) |
4442 | __mem_cgroup_threshold(memcg, true); | 4454 | __mem_cgroup_threshold(memcg, true); |
4443 | 4455 | ||
4444 | memcg = parent_mem_cgroup(memcg); | 4456 | memcg = parent_mem_cgroup(memcg); |
4445 | } | 4457 | } |
4446 | } | 4458 | } |
4447 | 4459 | ||
4448 | static int compare_thresholds(const void *a, const void *b) | 4460 | static int compare_thresholds(const void *a, const void *b) |
4449 | { | 4461 | { |
4450 | const struct mem_cgroup_threshold *_a = a; | 4462 | const struct mem_cgroup_threshold *_a = a; |
4451 | const struct mem_cgroup_threshold *_b = b; | 4463 | const struct mem_cgroup_threshold *_b = b; |
4452 | 4464 | ||
4453 | if (_a->threshold > _b->threshold) | 4465 | if (_a->threshold > _b->threshold) |
4454 | return 1; | 4466 | return 1; |
4455 | 4467 | ||
4456 | if (_a->threshold < _b->threshold) | 4468 | if (_a->threshold < _b->threshold) |
4457 | return -1; | 4469 | return -1; |
4458 | 4470 | ||
4459 | return 0; | 4471 | return 0; |
4460 | } | 4472 | } |
4461 | 4473 | ||
4462 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) | 4474 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) |
4463 | { | 4475 | { |
4464 | struct mem_cgroup_eventfd_list *ev; | 4476 | struct mem_cgroup_eventfd_list *ev; |
4465 | 4477 | ||
4466 | spin_lock(&memcg_oom_lock); | 4478 | spin_lock(&memcg_oom_lock); |
4467 | 4479 | ||
4468 | list_for_each_entry(ev, &memcg->oom_notify, list) | 4480 | list_for_each_entry(ev, &memcg->oom_notify, list) |
4469 | eventfd_signal(ev->eventfd, 1); | 4481 | eventfd_signal(ev->eventfd, 1); |
4470 | 4482 | ||
4471 | spin_unlock(&memcg_oom_lock); | 4483 | spin_unlock(&memcg_oom_lock); |
4472 | return 0; | 4484 | return 0; |
4473 | } | 4485 | } |
4474 | 4486 | ||
4475 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) | 4487 | static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) |
4476 | { | 4488 | { |
4477 | struct mem_cgroup *iter; | 4489 | struct mem_cgroup *iter; |
4478 | 4490 | ||
4479 | for_each_mem_cgroup_tree(iter, memcg) | 4491 | for_each_mem_cgroup_tree(iter, memcg) |
4480 | mem_cgroup_oom_notify_cb(iter); | 4492 | mem_cgroup_oom_notify_cb(iter); |
4481 | } | 4493 | } |
4482 | 4494 | ||
4483 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | 4495 | static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
4484 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) | 4496 | struct eventfd_ctx *eventfd, const char *args, enum res_type type) |
4485 | { | 4497 | { |
4486 | struct mem_cgroup_thresholds *thresholds; | 4498 | struct mem_cgroup_thresholds *thresholds; |
4487 | struct mem_cgroup_threshold_ary *new; | 4499 | struct mem_cgroup_threshold_ary *new; |
4488 | unsigned long threshold; | 4500 | unsigned long threshold; |
4489 | unsigned long usage; | 4501 | unsigned long usage; |
4490 | int i, size, ret; | 4502 | int i, size, ret; |
4491 | 4503 | ||
4492 | ret = page_counter_memparse(args, &threshold); | 4504 | ret = page_counter_memparse(args, &threshold); |
4493 | if (ret) | 4505 | if (ret) |
4494 | return ret; | 4506 | return ret; |
4495 | 4507 | ||
4496 | mutex_lock(&memcg->thresholds_lock); | 4508 | mutex_lock(&memcg->thresholds_lock); |
4497 | 4509 | ||
4498 | if (type == _MEM) { | 4510 | if (type == _MEM) { |
4499 | thresholds = &memcg->thresholds; | 4511 | thresholds = &memcg->thresholds; |
4500 | usage = mem_cgroup_usage(memcg, false); | 4512 | usage = mem_cgroup_usage(memcg, false); |
4501 | } else if (type == _MEMSWAP) { | 4513 | } else if (type == _MEMSWAP) { |
4502 | thresholds = &memcg->memsw_thresholds; | 4514 | thresholds = &memcg->memsw_thresholds; |
4503 | usage = mem_cgroup_usage(memcg, true); | 4515 | usage = mem_cgroup_usage(memcg, true); |
4504 | } else | 4516 | } else |
4505 | BUG(); | 4517 | BUG(); |
4506 | 4518 | ||
4507 | /* Check if a threshold crossed before adding a new one */ | 4519 | /* Check if a threshold crossed before adding a new one */ |
4508 | if (thresholds->primary) | 4520 | if (thresholds->primary) |
4509 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 4521 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
4510 | 4522 | ||
4511 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; | 4523 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
4512 | 4524 | ||
4513 | /* Allocate memory for new array of thresholds */ | 4525 | /* Allocate memory for new array of thresholds */ |
4514 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), | 4526 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
4515 | GFP_KERNEL); | 4527 | GFP_KERNEL); |
4516 | if (!new) { | 4528 | if (!new) { |
4517 | ret = -ENOMEM; | 4529 | ret = -ENOMEM; |
4518 | goto unlock; | 4530 | goto unlock; |
4519 | } | 4531 | } |
4520 | new->size = size; | 4532 | new->size = size; |
4521 | 4533 | ||
4522 | /* Copy thresholds (if any) to new array */ | 4534 | /* Copy thresholds (if any) to new array */ |
4523 | if (thresholds->primary) { | 4535 | if (thresholds->primary) { |
4524 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * | 4536 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
4525 | sizeof(struct mem_cgroup_threshold)); | 4537 | sizeof(struct mem_cgroup_threshold)); |
4526 | } | 4538 | } |
4527 | 4539 | ||
4528 | /* Add new threshold */ | 4540 | /* Add new threshold */ |
4529 | new->entries[size - 1].eventfd = eventfd; | 4541 | new->entries[size - 1].eventfd = eventfd; |
4530 | new->entries[size - 1].threshold = threshold; | 4542 | new->entries[size - 1].threshold = threshold; |
4531 | 4543 | ||
4532 | /* Sort thresholds. Registering a new threshold isn't time-critical */ | 4544 | /* Sort thresholds. Registering a new threshold isn't time-critical */ |
4533 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), | 4545 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
4534 | compare_thresholds, NULL); | 4546 | compare_thresholds, NULL); |
4535 | 4547 | ||
4536 | /* Find current threshold */ | 4548 | /* Find current threshold */ |
4537 | new->current_threshold = -1; | 4549 | new->current_threshold = -1; |
4538 | for (i = 0; i < size; i++) { | 4550 | for (i = 0; i < size; i++) { |
4539 | if (new->entries[i].threshold <= usage) { | 4551 | if (new->entries[i].threshold <= usage) { |
4540 | /* | 4552 | /* |
4541 | * new->current_threshold will not be used until | 4553 | * new->current_threshold will not be used until |
4542 | * rcu_assign_pointer(), so it's safe to increment | 4554 | * rcu_assign_pointer(), so it's safe to increment |
4543 | * it here. | 4555 | * it here. |
4544 | */ | 4556 | */ |
4545 | ++new->current_threshold; | 4557 | ++new->current_threshold; |
4546 | } else | 4558 | } else |
4547 | break; | 4559 | break; |
4548 | } | 4560 | } |
4549 | 4561 | ||
4550 | /* Free old spare buffer and save old primary buffer as spare */ | 4562 | /* Free old spare buffer and save old primary buffer as spare */ |
4551 | kfree(thresholds->spare); | 4563 | kfree(thresholds->spare); |
4552 | thresholds->spare = thresholds->primary; | 4564 | thresholds->spare = thresholds->primary; |
4553 | 4565 | ||
4554 | rcu_assign_pointer(thresholds->primary, new); | 4566 | rcu_assign_pointer(thresholds->primary, new); |
4555 | 4567 | ||
4556 | /* To be sure that nobody uses thresholds */ | 4568 | /* To be sure that nobody uses thresholds */ |
4557 | synchronize_rcu(); | 4569 | synchronize_rcu(); |
4558 | 4570 | ||
4559 | unlock: | 4571 | unlock: |
4560 | mutex_unlock(&memcg->thresholds_lock); | 4572 | mutex_unlock(&memcg->thresholds_lock); |
4561 | 4573 | ||
4562 | return ret; | 4574 | return ret; |
4563 | } | 4575 | } |
4564 | 4576 | ||
4565 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, | 4577 | static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, |
4566 | struct eventfd_ctx *eventfd, const char *args) | 4578 | struct eventfd_ctx *eventfd, const char *args) |
4567 | { | 4579 | { |
4568 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); | 4580 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); |
4569 | } | 4581 | } |
4570 | 4582 | ||
4571 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, | 4583 | static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, |
4572 | struct eventfd_ctx *eventfd, const char *args) | 4584 | struct eventfd_ctx *eventfd, const char *args) |
4573 | { | 4585 | { |
4574 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); | 4586 | return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); |
4575 | } | 4587 | } |
4576 | 4588 | ||
4577 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 4589 | static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
4578 | struct eventfd_ctx *eventfd, enum res_type type) | 4590 | struct eventfd_ctx *eventfd, enum res_type type) |
4579 | { | 4591 | { |
4580 | struct mem_cgroup_thresholds *thresholds; | 4592 | struct mem_cgroup_thresholds *thresholds; |
4581 | struct mem_cgroup_threshold_ary *new; | 4593 | struct mem_cgroup_threshold_ary *new; |
4582 | unsigned long usage; | 4594 | unsigned long usage; |
4583 | int i, j, size; | 4595 | int i, j, size; |
4584 | 4596 | ||
4585 | mutex_lock(&memcg->thresholds_lock); | 4597 | mutex_lock(&memcg->thresholds_lock); |
4586 | 4598 | ||
4587 | if (type == _MEM) { | 4599 | if (type == _MEM) { |
4588 | thresholds = &memcg->thresholds; | 4600 | thresholds = &memcg->thresholds; |
4589 | usage = mem_cgroup_usage(memcg, false); | 4601 | usage = mem_cgroup_usage(memcg, false); |
4590 | } else if (type == _MEMSWAP) { | 4602 | } else if (type == _MEMSWAP) { |
4591 | thresholds = &memcg->memsw_thresholds; | 4603 | thresholds = &memcg->memsw_thresholds; |
4592 | usage = mem_cgroup_usage(memcg, true); | 4604 | usage = mem_cgroup_usage(memcg, true); |
4593 | } else | 4605 | } else |
4594 | BUG(); | 4606 | BUG(); |
4595 | 4607 | ||
4596 | if (!thresholds->primary) | 4608 | if (!thresholds->primary) |
4597 | goto unlock; | 4609 | goto unlock; |
4598 | 4610 | ||
4599 | /* Check if a threshold crossed before removing */ | 4611 | /* Check if a threshold crossed before removing */ |
4600 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 4612 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
4601 | 4613 | ||
4602 | /* Calculate the new number of thresholds */ | 4614 | /* Calculate the new number of thresholds */ |
4603 | size = 0; | 4615 | size = 0; |
4604 | for (i = 0; i < thresholds->primary->size; i++) { | 4616 | for (i = 0; i < thresholds->primary->size; i++) { |
4605 | if (thresholds->primary->entries[i].eventfd != eventfd) | 4617 | if (thresholds->primary->entries[i].eventfd != eventfd) |
4606 | size++; | 4618 | size++; |
4607 | } | 4619 | } |
4608 | 4620 | ||
4609 | new = thresholds->spare; | 4621 | new = thresholds->spare; |
4610 | 4622 | ||
4611 | /* Set thresholds array to NULL if we don't have thresholds */ | 4623 | /* Set thresholds array to NULL if we don't have thresholds */ |
4612 | if (!size) { | 4624 | if (!size) { |
4613 | kfree(new); | 4625 | kfree(new); |
4614 | new = NULL; | 4626 | new = NULL; |
4615 | goto swap_buffers; | 4627 | goto swap_buffers; |
4616 | } | 4628 | } |
4617 | 4629 | ||
4618 | new->size = size; | 4630 | new->size = size; |
4619 | 4631 | ||
4620 | /* Copy thresholds and find current threshold */ | 4632 | /* Copy thresholds and find current threshold */ |
4621 | new->current_threshold = -1; | 4633 | new->current_threshold = -1; |
4622 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { | 4634 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
4623 | if (thresholds->primary->entries[i].eventfd == eventfd) | 4635 | if (thresholds->primary->entries[i].eventfd == eventfd) |
4624 | continue; | 4636 | continue; |
4625 | 4637 | ||
4626 | new->entries[j] = thresholds->primary->entries[i]; | 4638 | new->entries[j] = thresholds->primary->entries[i]; |
4627 | if (new->entries[j].threshold <= usage) { | 4639 | if (new->entries[j].threshold <= usage) { |
4628 | /* | 4640 | /* |
4629 | * new->current_threshold will not be used | 4641 | * new->current_threshold will not be used |
4630 | * until rcu_assign_pointer(), so it's safe to increment | 4642 | * until rcu_assign_pointer(), so it's safe to increment |
4631 | * it here. | 4643 | * it here. |
4632 | */ | 4644 | */ |
4633 | ++new->current_threshold; | 4645 | ++new->current_threshold; |
4634 | } | 4646 | } |
4635 | j++; | 4647 | j++; |
4636 | } | 4648 | } |
4637 | 4649 | ||
4638 | swap_buffers: | 4650 | swap_buffers: |
4639 | /* Swap primary and spare array */ | 4651 | /* Swap primary and spare array */ |
4640 | thresholds->spare = thresholds->primary; | 4652 | thresholds->spare = thresholds->primary; |
4641 | /* If all events are unregistered, free the spare array */ | 4653 | /* If all events are unregistered, free the spare array */ |
4642 | if (!new) { | 4654 | if (!new) { |
4643 | kfree(thresholds->spare); | 4655 | kfree(thresholds->spare); |
4644 | thresholds->spare = NULL; | 4656 | thresholds->spare = NULL; |
4645 | } | 4657 | } |
4646 | 4658 | ||
4647 | rcu_assign_pointer(thresholds->primary, new); | 4659 | rcu_assign_pointer(thresholds->primary, new); |
4648 | 4660 | ||
4649 | /* To be sure that nobody uses thresholds */ | 4661 | /* To be sure that nobody uses thresholds */ |
4650 | synchronize_rcu(); | 4662 | synchronize_rcu(); |
4651 | unlock: | 4663 | unlock: |
4652 | mutex_unlock(&memcg->thresholds_lock); | 4664 | mutex_unlock(&memcg->thresholds_lock); |
4653 | } | 4665 | } |
4654 | 4666 | ||
4655 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 4667 | static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
4656 | struct eventfd_ctx *eventfd) | 4668 | struct eventfd_ctx *eventfd) |
4657 | { | 4669 | { |
4658 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); | 4670 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); |
4659 | } | 4671 | } |
4660 | 4672 | ||
4661 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | 4673 | static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, |
4662 | struct eventfd_ctx *eventfd) | 4674 | struct eventfd_ctx *eventfd) |
4663 | { | 4675 | { |
4664 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); | 4676 | return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); |
4665 | } | 4677 | } |
4666 | 4678 | ||
4667 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, | 4679 | static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, |
4668 | struct eventfd_ctx *eventfd, const char *args) | 4680 | struct eventfd_ctx *eventfd, const char *args) |
4669 | { | 4681 | { |
4670 | struct mem_cgroup_eventfd_list *event; | 4682 | struct mem_cgroup_eventfd_list *event; |
4671 | 4683 | ||
4672 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 4684 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
4673 | if (!event) | 4685 | if (!event) |
4674 | return -ENOMEM; | 4686 | return -ENOMEM; |
4675 | 4687 | ||
4676 | spin_lock(&memcg_oom_lock); | 4688 | spin_lock(&memcg_oom_lock); |
4677 | 4689 | ||
4678 | event->eventfd = eventfd; | 4690 | event->eventfd = eventfd; |
4679 | list_add(&event->list, &memcg->oom_notify); | 4691 | list_add(&event->list, &memcg->oom_notify); |
4680 | 4692 | ||
4681 | /* already in OOM ? */ | 4693 | /* already in OOM ? */ |
4682 | if (atomic_read(&memcg->under_oom)) | 4694 | if (atomic_read(&memcg->under_oom)) |
4683 | eventfd_signal(eventfd, 1); | 4695 | eventfd_signal(eventfd, 1); |
4684 | spin_unlock(&memcg_oom_lock); | 4696 | spin_unlock(&memcg_oom_lock); |
4685 | 4697 | ||
4686 | return 0; | 4698 | return 0; |
4687 | } | 4699 | } |
4688 | 4700 | ||
4689 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, | 4701 | static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, |
4690 | struct eventfd_ctx *eventfd) | 4702 | struct eventfd_ctx *eventfd) |
4691 | { | 4703 | { |
4692 | struct mem_cgroup_eventfd_list *ev, *tmp; | 4704 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4693 | 4705 | ||
4694 | spin_lock(&memcg_oom_lock); | 4706 | spin_lock(&memcg_oom_lock); |
4695 | 4707 | ||
4696 | list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { | 4708 | list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { |
4697 | if (ev->eventfd == eventfd) { | 4709 | if (ev->eventfd == eventfd) { |
4698 | list_del(&ev->list); | 4710 | list_del(&ev->list); |
4699 | kfree(ev); | 4711 | kfree(ev); |
4700 | } | 4712 | } |
4701 | } | 4713 | } |
4702 | 4714 | ||
4703 | spin_unlock(&memcg_oom_lock); | 4715 | spin_unlock(&memcg_oom_lock); |
4704 | } | 4716 | } |
4705 | 4717 | ||
4706 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) | 4718 | static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) |
4707 | { | 4719 | { |
4708 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); | 4720 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); |
4709 | 4721 | ||
4710 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); | 4722 | seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); |
4711 | seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); | 4723 | seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); |
4712 | return 0; | 4724 | return 0; |
4713 | } | 4725 | } |
4714 | 4726 | ||
4715 | static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, | 4727 | static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, |
4716 | struct cftype *cft, u64 val) | 4728 | struct cftype *cft, u64 val) |
4717 | { | 4729 | { |
4718 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4730 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4719 | 4731 | ||
4720 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | 4732 | /* cannot set to root cgroup and only 0 and 1 are allowed */ |
4721 | if (!css->parent || !((val == 0) || (val == 1))) | 4733 | if (!css->parent || !((val == 0) || (val == 1))) |
4722 | return -EINVAL; | 4734 | return -EINVAL; |
4723 | 4735 | ||
4724 | memcg->oom_kill_disable = val; | 4736 | memcg->oom_kill_disable = val; |
4725 | if (!val) | 4737 | if (!val) |
4726 | memcg_oom_recover(memcg); | 4738 | memcg_oom_recover(memcg); |
4727 | 4739 | ||
4728 | return 0; | 4740 | return 0; |
4729 | } | 4741 | } |
4730 | 4742 | ||
4731 | #ifdef CONFIG_MEMCG_KMEM | 4743 | #ifdef CONFIG_MEMCG_KMEM |
4732 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4744 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4733 | { | 4745 | { |
4734 | int ret; | 4746 | int ret; |
4735 | 4747 | ||
4736 | memcg->kmemcg_id = -1; | 4748 | memcg->kmemcg_id = -1; |
4737 | ret = memcg_propagate_kmem(memcg); | 4749 | ret = memcg_propagate_kmem(memcg); |
4738 | if (ret) | 4750 | if (ret) |
4739 | return ret; | 4751 | return ret; |
4740 | 4752 | ||
4741 | return mem_cgroup_sockets_init(memcg, ss); | 4753 | return mem_cgroup_sockets_init(memcg, ss); |
4742 | } | 4754 | } |
4743 | 4755 | ||
4744 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4756 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4745 | { | 4757 | { |
4746 | mem_cgroup_sockets_destroy(memcg); | 4758 | mem_cgroup_sockets_destroy(memcg); |
4747 | } | 4759 | } |
4748 | 4760 | ||
4749 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | 4761 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) |
4750 | { | 4762 | { |
4751 | if (!memcg_kmem_is_active(memcg)) | 4763 | if (!memcg_kmem_is_active(memcg)) |
4752 | return; | 4764 | return; |
4753 | 4765 | ||
4754 | /* | 4766 | /* |
4755 | * kmem charges can outlive the cgroup. In the case of slab | 4767 | * kmem charges can outlive the cgroup. In the case of slab |
4756 | * pages, for instance, a page can contain objects from various | 4768 | * pages, for instance, a page can contain objects from various |
4757 | * processes. Since we do not take a reference for every | 4769 | * processes. Since we do not take a reference for every |
4758 | * such allocation, we have to be careful when doing uncharge | 4770 | * such allocation, we have to be careful when doing uncharge |
4759 | * (see memcg_uncharge_kmem) and here during offlining. | 4771 | * (see memcg_uncharge_kmem) and here during offlining. |
4760 | * | 4772 | * |
4761 | * The idea is that only the _last_ uncharge which sees | 4773 | * The idea is that only the _last_ uncharge which sees |
4762 | * the dead memcg will drop the last reference. An additional | 4774 | * the dead memcg will drop the last reference. An additional |
4763 | * reference is taken here before the group is marked dead | 4775 | * reference is taken here before the group is marked dead |
4764 | * which is then paired with css_put during uncharge or here, respectively. | 4776 | * which is then paired with css_put during uncharge or here, respectively. |
4765 | * | 4777 | * |
4766 | * Although this might sound strange as this path is called from | 4778 | * Although this might sound strange as this path is called from |
4767 | * css_offline() when the reference might have dropped down to 0 and | 4779 | * css_offline() when the reference might have dropped down to 0 and |
4768 | * shouldn't be incremented anymore (css_tryget_online() would | 4780 | * shouldn't be incremented anymore (css_tryget_online() would |
4769 | * fail) we do not have other options because of the kmem | 4781 | * fail) we do not have other options because of the kmem |
4770 | * allocations' lifetime. | 4782 | * allocations' lifetime. |
4771 | */ | 4783 | */ |
4772 | css_get(&memcg->css); | 4784 | css_get(&memcg->css); |
4773 | 4785 | ||
4774 | memcg_kmem_mark_dead(memcg); | 4786 | memcg_kmem_mark_dead(memcg); |
4775 | 4787 | ||
4776 | if (page_counter_read(&memcg->kmem)) | 4788 | if (page_counter_read(&memcg->kmem)) |
4777 | return; | 4789 | return; |
4778 | 4790 | ||
4779 | if (memcg_kmem_test_and_clear_dead(memcg)) | 4791 | if (memcg_kmem_test_and_clear_dead(memcg)) |
4780 | css_put(&memcg->css); | 4792 | css_put(&memcg->css); |
4781 | } | 4793 | } |
4782 | #else | 4794 | #else |
4783 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4795 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4784 | { | 4796 | { |
4785 | return 0; | 4797 | return 0; |
4786 | } | 4798 | } |
4787 | 4799 | ||
4788 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) | 4800 | static void memcg_destroy_kmem(struct mem_cgroup *memcg) |
4789 | { | 4801 | { |
4790 | } | 4802 | } |
4791 | 4803 | ||
4792 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) | 4804 | static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) |
4793 | { | 4805 | { |
4794 | } | 4806 | } |
4795 | #endif | 4807 | #endif |
4796 | 4808 | ||
4797 | /* | 4809 | /* |
4798 | * DO NOT USE IN NEW FILES. | 4810 | * DO NOT USE IN NEW FILES. |
4799 | * | 4811 | * |
4800 | * "cgroup.event_control" implementation. | 4812 | * "cgroup.event_control" implementation. |
4801 | * | 4813 | * |
4802 | * This is way over-engineered. It tries to support fully configurable | 4814 | * This is way over-engineered. It tries to support fully configurable |
4803 | * events for each user. Such a level of flexibility is completely | 4815 | * events for each user. Such a level of flexibility is completely |
4804 | * unnecessary especially in the light of the planned unified hierarchy. | 4816 | * unnecessary especially in the light of the planned unified hierarchy. |
4805 | * | 4817 | * |
4806 | * Please deprecate this and replace with something simpler if at all | 4818 | * Please deprecate this and replace with something simpler if at all |
4807 | * possible. | 4819 | * possible. |
4808 | */ | 4820 | */ |
4809 | 4821 | ||
4810 | /* | 4822 | /* |
4811 | * Unregister event and free resources. | 4823 | * Unregister event and free resources. |
4812 | * | 4824 | * |
4813 | * Gets called from workqueue. | 4825 | * Gets called from workqueue. |
4814 | */ | 4826 | */ |
4815 | static void memcg_event_remove(struct work_struct *work) | 4827 | static void memcg_event_remove(struct work_struct *work) |
4816 | { | 4828 | { |
4817 | struct mem_cgroup_event *event = | 4829 | struct mem_cgroup_event *event = |
4818 | container_of(work, struct mem_cgroup_event, remove); | 4830 | container_of(work, struct mem_cgroup_event, remove); |
4819 | struct mem_cgroup *memcg = event->memcg; | 4831 | struct mem_cgroup *memcg = event->memcg; |
4820 | 4832 | ||
4821 | remove_wait_queue(event->wqh, &event->wait); | 4833 | remove_wait_queue(event->wqh, &event->wait); |
4822 | 4834 | ||
4823 | event->unregister_event(memcg, event->eventfd); | 4835 | event->unregister_event(memcg, event->eventfd); |
4824 | 4836 | ||
4825 | /* Notify userspace the event is going away. */ | 4837 | /* Notify userspace the event is going away. */ |
4826 | eventfd_signal(event->eventfd, 1); | 4838 | eventfd_signal(event->eventfd, 1); |
4827 | 4839 | ||
4828 | eventfd_ctx_put(event->eventfd); | 4840 | eventfd_ctx_put(event->eventfd); |
4829 | kfree(event); | 4841 | kfree(event); |
4830 | css_put(&memcg->css); | 4842 | css_put(&memcg->css); |
4831 | } | 4843 | } |
4832 | 4844 | ||
4833 | /* | 4845 | /* |
4834 | * Gets called on POLLHUP on eventfd when user closes it. | 4846 | * Gets called on POLLHUP on eventfd when user closes it. |
4835 | * | 4847 | * |
4836 | * Called with wqh->lock held and interrupts disabled. | 4848 | * Called with wqh->lock held and interrupts disabled. |
4837 | */ | 4849 | */ |
4838 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, | 4850 | static int memcg_event_wake(wait_queue_t *wait, unsigned mode, |
4839 | int sync, void *key) | 4851 | int sync, void *key) |
4840 | { | 4852 | { |
4841 | struct mem_cgroup_event *event = | 4853 | struct mem_cgroup_event *event = |
4842 | container_of(wait, struct mem_cgroup_event, wait); | 4854 | container_of(wait, struct mem_cgroup_event, wait); |
4843 | struct mem_cgroup *memcg = event->memcg; | 4855 | struct mem_cgroup *memcg = event->memcg; |
4844 | unsigned long flags = (unsigned long)key; | 4856 | unsigned long flags = (unsigned long)key; |
4845 | 4857 | ||
4846 | if (flags & POLLHUP) { | 4858 | if (flags & POLLHUP) { |
4847 | /* | 4859 | /* |
4848 | * If the event has been detached at cgroup removal, we | 4860 | * If the event has been detached at cgroup removal, we |
4849 | * can simply return knowing the other side will clean up | 4861 | * can simply return knowing the other side will clean up |
4850 | * for us. | 4862 | * for us. |
4851 | * | 4863 | * |
4852 | * We can't race against event freeing since the other | 4864 | * We can't race against event freeing since the other |
4853 | * side will require wqh->lock via remove_wait_queue(), | 4865 | * side will require wqh->lock via remove_wait_queue(), |
4854 | * which we hold. | 4866 | * which we hold. |
4855 | */ | 4867 | */ |
4856 | spin_lock(&memcg->event_list_lock); | 4868 | spin_lock(&memcg->event_list_lock); |
4857 | if (!list_empty(&event->list)) { | 4869 | if (!list_empty(&event->list)) { |
4858 | list_del_init(&event->list); | 4870 | list_del_init(&event->list); |
4859 | /* | 4871 | /* |
4860 | * We are in atomic context, but memcg_event_remove() | 4872 | * We are in atomic context, but memcg_event_remove() |
4861 | * may sleep, so we have to call it from a workqueue. | 4873 | * may sleep, so we have to call it from a workqueue. |
4862 | */ | 4874 | */ |
4863 | schedule_work(&event->remove); | 4875 | schedule_work(&event->remove); |
4864 | } | 4876 | } |
4865 | spin_unlock(&memcg->event_list_lock); | 4877 | spin_unlock(&memcg->event_list_lock); |
4866 | } | 4878 | } |
4867 | 4879 | ||
4868 | return 0; | 4880 | return 0; |
4869 | } | 4881 | } |
4870 | 4882 | ||
4871 | static void memcg_event_ptable_queue_proc(struct file *file, | 4883 | static void memcg_event_ptable_queue_proc(struct file *file, |
4872 | wait_queue_head_t *wqh, poll_table *pt) | 4884 | wait_queue_head_t *wqh, poll_table *pt) |
4873 | { | 4885 | { |
4874 | struct mem_cgroup_event *event = | 4886 | struct mem_cgroup_event *event = |
4875 | container_of(pt, struct mem_cgroup_event, pt); | 4887 | container_of(pt, struct mem_cgroup_event, pt); |
4876 | 4888 | ||
4877 | event->wqh = wqh; | 4889 | event->wqh = wqh; |
4878 | add_wait_queue(wqh, &event->wait); | 4890 | add_wait_queue(wqh, &event->wait); |
4879 | } | 4891 | } |
4880 | 4892 | ||
4881 | /* | 4893 | /* |
4882 | * DO NOT USE IN NEW FILES. | 4894 | * DO NOT USE IN NEW FILES. |
4883 | * | 4895 | * |
4884 | * Parse input and register new cgroup event handler. | 4896 | * Parse input and register new cgroup event handler. |
4885 | * | 4897 | * |
4886 | * Input must be in format '<event_fd> <control_fd> <args>'. | 4898 | * Input must be in format '<event_fd> <control_fd> <args>'. |
4887 | * Interpretation of args is defined by control file implementation. | 4899 | * Interpretation of args is defined by control file implementation. |
4888 | */ | 4900 | */ |
4889 | static ssize_t memcg_write_event_control(struct kernfs_open_file *of, | 4901 | static ssize_t memcg_write_event_control(struct kernfs_open_file *of, |
4890 | char *buf, size_t nbytes, loff_t off) | 4902 | char *buf, size_t nbytes, loff_t off) |
4891 | { | 4903 | { |
4892 | struct cgroup_subsys_state *css = of_css(of); | 4904 | struct cgroup_subsys_state *css = of_css(of); |
4893 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 4905 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
4894 | struct mem_cgroup_event *event; | 4906 | struct mem_cgroup_event *event; |
4895 | struct cgroup_subsys_state *cfile_css; | 4907 | struct cgroup_subsys_state *cfile_css; |
4896 | unsigned int efd, cfd; | 4908 | unsigned int efd, cfd; |
4897 | struct fd efile; | 4909 | struct fd efile; |
4898 | struct fd cfile; | 4910 | struct fd cfile; |
4899 | const char *name; | 4911 | const char *name; |
4900 | char *endp; | 4912 | char *endp; |
4901 | int ret; | 4913 | int ret; |
4902 | 4914 | ||
4903 | buf = strstrip(buf); | 4915 | buf = strstrip(buf); |
4904 | 4916 | ||
4905 | efd = simple_strtoul(buf, &endp, 10); | 4917 | efd = simple_strtoul(buf, &endp, 10); |
4906 | if (*endp != ' ') | 4918 | if (*endp != ' ') |
4907 | return -EINVAL; | 4919 | return -EINVAL; |
4908 | buf = endp + 1; | 4920 | buf = endp + 1; |
4909 | 4921 | ||
4910 | cfd = simple_strtoul(buf, &endp, 10); | 4922 | cfd = simple_strtoul(buf, &endp, 10); |
4911 | if ((*endp != ' ') && (*endp != '\0')) | 4923 | if ((*endp != ' ') && (*endp != '\0')) |
4912 | return -EINVAL; | 4924 | return -EINVAL; |
4913 | buf = endp + 1; | 4925 | buf = endp + 1; |
4914 | 4926 | ||
4915 | event = kzalloc(sizeof(*event), GFP_KERNEL); | 4927 | event = kzalloc(sizeof(*event), GFP_KERNEL); |
4916 | if (!event) | 4928 | if (!event) |
4917 | return -ENOMEM; | 4929 | return -ENOMEM; |
4918 | 4930 | ||
4919 | event->memcg = memcg; | 4931 | event->memcg = memcg; |
4920 | INIT_LIST_HEAD(&event->list); | 4932 | INIT_LIST_HEAD(&event->list); |
4921 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); | 4933 | init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); |
4922 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); | 4934 | init_waitqueue_func_entry(&event->wait, memcg_event_wake); |
4923 | INIT_WORK(&event->remove, memcg_event_remove); | 4935 | INIT_WORK(&event->remove, memcg_event_remove); |
4924 | 4936 | ||
4925 | efile = fdget(efd); | 4937 | efile = fdget(efd); |
4926 | if (!efile.file) { | 4938 | if (!efile.file) { |
4927 | ret = -EBADF; | 4939 | ret = -EBADF; |
4928 | goto out_kfree; | 4940 | goto out_kfree; |
4929 | } | 4941 | } |
4930 | 4942 | ||
4931 | event->eventfd = eventfd_ctx_fileget(efile.file); | 4943 | event->eventfd = eventfd_ctx_fileget(efile.file); |
4932 | if (IS_ERR(event->eventfd)) { | 4944 | if (IS_ERR(event->eventfd)) { |
4933 | ret = PTR_ERR(event->eventfd); | 4945 | ret = PTR_ERR(event->eventfd); |
4934 | goto out_put_efile; | 4946 | goto out_put_efile; |
4935 | } | 4947 | } |
4936 | 4948 | ||
4937 | cfile = fdget(cfd); | 4949 | cfile = fdget(cfd); |
4938 | if (!cfile.file) { | 4950 | if (!cfile.file) { |
4939 | ret = -EBADF; | 4951 | ret = -EBADF; |
4940 | goto out_put_eventfd; | 4952 | goto out_put_eventfd; |
4941 | } | 4953 | } |
4942 | 4954 | ||
4943 | /* the process needs read permission on the control file */ | 4955 | /* the process needs read permission on the control file */ |
4944 | /* AV: shouldn't we check that it's been opened for read instead? */ | 4956 | /* AV: shouldn't we check that it's been opened for read instead? */ |
4945 | ret = inode_permission(file_inode(cfile.file), MAY_READ); | 4957 | ret = inode_permission(file_inode(cfile.file), MAY_READ); |
4946 | if (ret < 0) | 4958 | if (ret < 0) |
4947 | goto out_put_cfile; | 4959 | goto out_put_cfile; |
4948 | 4960 | ||
4949 | /* | 4961 | /* |
4950 | * Determine the event callbacks and set them in @event. This used | 4962 | * Determine the event callbacks and set them in @event. This used |
4951 | * to be done via struct cftype but cgroup core no longer knows | 4963 | * to be done via struct cftype but cgroup core no longer knows |
4952 | * about these events. The following is crude but the whole thing | 4964 | * about these events. The following is crude but the whole thing |
4953 | * is for compatibility anyway. | 4965 | * is for compatibility anyway. |
4954 | * | 4966 | * |
4955 | * DO NOT ADD NEW FILES. | 4967 | * DO NOT ADD NEW FILES. |
4956 | */ | 4968 | */ |
4957 | name = cfile.file->f_dentry->d_name.name; | 4969 | name = cfile.file->f_dentry->d_name.name; |
4958 | 4970 | ||
4959 | if (!strcmp(name, "memory.usage_in_bytes")) { | 4971 | if (!strcmp(name, "memory.usage_in_bytes")) { |
4960 | event->register_event = mem_cgroup_usage_register_event; | 4972 | event->register_event = mem_cgroup_usage_register_event; |
4961 | event->unregister_event = mem_cgroup_usage_unregister_event; | 4973 | event->unregister_event = mem_cgroup_usage_unregister_event; |
4962 | } else if (!strcmp(name, "memory.oom_control")) { | 4974 | } else if (!strcmp(name, "memory.oom_control")) { |
4963 | event->register_event = mem_cgroup_oom_register_event; | 4975 | event->register_event = mem_cgroup_oom_register_event; |
4964 | event->unregister_event = mem_cgroup_oom_unregister_event; | 4976 | event->unregister_event = mem_cgroup_oom_unregister_event; |
4965 | } else if (!strcmp(name, "memory.pressure_level")) { | 4977 | } else if (!strcmp(name, "memory.pressure_level")) { |
4966 | event->register_event = vmpressure_register_event; | 4978 | event->register_event = vmpressure_register_event; |
4967 | event->unregister_event = vmpressure_unregister_event; | 4979 | event->unregister_event = vmpressure_unregister_event; |
4968 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { | 4980 | } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { |
4969 | event->register_event = memsw_cgroup_usage_register_event; | 4981 | event->register_event = memsw_cgroup_usage_register_event; |
4970 | event->unregister_event = memsw_cgroup_usage_unregister_event; | 4982 | event->unregister_event = memsw_cgroup_usage_unregister_event; |
4971 | } else { | 4983 | } else { |
4972 | ret = -EINVAL; | 4984 | ret = -EINVAL; |
4973 | goto out_put_cfile; | 4985 | goto out_put_cfile; |
4974 | } | 4986 | } |
4975 | 4987 | ||
4976 | /* | 4988 | /* |
4977 | * Verify that @cfile belongs to @css. Also, remaining events are | 4989 | * Verify that @cfile belongs to @css. Also, remaining events are |
4978 | * automatically removed on cgroup destruction but the removal is | 4990 | * automatically removed on cgroup destruction but the removal is |
4979 | * asynchronous, so take an extra ref on @css. | 4991 | * asynchronous, so take an extra ref on @css. |
4980 | */ | 4992 | */ |
4981 | cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, | 4993 | cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, |
4982 | &memory_cgrp_subsys); | 4994 | &memory_cgrp_subsys); |
4983 | ret = -EINVAL; | 4995 | ret = -EINVAL; |
4984 | if (IS_ERR(cfile_css)) | 4996 | if (IS_ERR(cfile_css)) |
4985 | goto out_put_cfile; | 4997 | goto out_put_cfile; |
4986 | if (cfile_css != css) { | 4998 | if (cfile_css != css) { |
4987 | css_put(cfile_css); | 4999 | css_put(cfile_css); |
4988 | goto out_put_cfile; | 5000 | goto out_put_cfile; |
4989 | } | 5001 | } |
4990 | 5002 | ||
4991 | ret = event->register_event(memcg, event->eventfd, buf); | 5003 | ret = event->register_event(memcg, event->eventfd, buf); |
4992 | if (ret) | 5004 | if (ret) |
4993 | goto out_put_css; | 5005 | goto out_put_css; |
4994 | 5006 | ||
4995 | efile.file->f_op->poll(efile.file, &event->pt); | 5007 | efile.file->f_op->poll(efile.file, &event->pt); |
4996 | 5008 | ||
4997 | spin_lock(&memcg->event_list_lock); | 5009 | spin_lock(&memcg->event_list_lock); |
4998 | list_add(&event->list, &memcg->event_list); | 5010 | list_add(&event->list, &memcg->event_list); |
4999 | spin_unlock(&memcg->event_list_lock); | 5011 | spin_unlock(&memcg->event_list_lock); |
5000 | 5012 | ||
5001 | fdput(cfile); | 5013 | fdput(cfile); |
5002 | fdput(efile); | 5014 | fdput(efile); |
5003 | 5015 | ||
5004 | return nbytes; | 5016 | return nbytes; |
5005 | 5017 | ||
5006 | out_put_css: | 5018 | out_put_css: |
5007 | css_put(css); | 5019 | css_put(css); |
5008 | out_put_cfile: | 5020 | out_put_cfile: |
5009 | fdput(cfile); | 5021 | fdput(cfile); |
5010 | out_put_eventfd: | 5022 | out_put_eventfd: |
5011 | eventfd_ctx_put(event->eventfd); | 5023 | eventfd_ctx_put(event->eventfd); |
5012 | out_put_efile: | 5024 | out_put_efile: |
5013 | fdput(efile); | 5025 | fdput(efile); |
5014 | out_kfree: | 5026 | out_kfree: |
5015 | kfree(event); | 5027 | kfree(event); |
5016 | 5028 | ||
5017 | return ret; | 5029 | return ret; |
5018 | } | 5030 | } |
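
For readers unfamiliar with the legacy interface parsed above, here is a minimal userspace sketch (not part of this patch) of how a threshold event is registered through cgroup.event_control; the cgroup mount point, the group name "mygrp" and the 64 MiB threshold are assumptions for illustration only:

/*
 * Hypothetical example: arm a 64 MiB memory.usage_in_bytes threshold
 * on a v1 memcg and block until the kernel signals the eventfd.
 * The cgroup path below is an assumption, not part of the patch.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	const char *grp = "/sys/fs/cgroup/memory/mygrp";	/* assumed path */
	char path[256], cmd[64];
	uint64_t count;
	int efd, ufd, cfd;

	efd = eventfd(0, 0);					/* <event_fd> */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", grp);
	ufd = open(path, O_RDONLY);				/* <control_fd> */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args is the threshold in bytes */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	/* blocks until eventfd_signal() is called on a threshold crossing */
	if (read(efd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)count);
	return 0;
}

The same "<event_fd> <control_fd> <args>" write format drives memory.oom_control and memory.pressure_level registrations, dispatched on the control file name in memcg_write_event_control() above.
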
5019 | 5031 | ||
5020 | static struct cftype mem_cgroup_files[] = { | 5032 | static struct cftype mem_cgroup_files[] = { |
5021 | { | 5033 | { |
5022 | .name = "usage_in_bytes", | 5034 | .name = "usage_in_bytes", |
5023 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 5035 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
5024 | .read_u64 = mem_cgroup_read_u64, | 5036 | .read_u64 = mem_cgroup_read_u64, |
5025 | }, | 5037 | }, |
5026 | { | 5038 | { |
5027 | .name = "max_usage_in_bytes", | 5039 | .name = "max_usage_in_bytes", |
5028 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), | 5040 | .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), |
5029 | .write = mem_cgroup_reset, | 5041 | .write = mem_cgroup_reset, |
5030 | .read_u64 = mem_cgroup_read_u64, | 5042 | .read_u64 = mem_cgroup_read_u64, |
5031 | }, | 5043 | }, |
5032 | { | 5044 | { |
5033 | .name = "limit_in_bytes", | 5045 | .name = "limit_in_bytes", |
5034 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), | 5046 | .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), |
5035 | .write = mem_cgroup_write, | 5047 | .write = mem_cgroup_write, |
5036 | .read_u64 = mem_cgroup_read_u64, | 5048 | .read_u64 = mem_cgroup_read_u64, |
5037 | }, | 5049 | }, |
5038 | { | 5050 | { |
5039 | .name = "soft_limit_in_bytes", | 5051 | .name = "soft_limit_in_bytes", |
5040 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | 5052 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), |
5041 | .write = mem_cgroup_write, | 5053 | .write = mem_cgroup_write, |
5042 | .read_u64 = mem_cgroup_read_u64, | 5054 | .read_u64 = mem_cgroup_read_u64, |
5043 | }, | 5055 | }, |
5044 | { | 5056 | { |
5045 | .name = "failcnt", | 5057 | .name = "failcnt", |
5046 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 5058 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
5047 | .write = mem_cgroup_reset, | 5059 | .write = mem_cgroup_reset, |
5048 | .read_u64 = mem_cgroup_read_u64, | 5060 | .read_u64 = mem_cgroup_read_u64, |
5049 | }, | 5061 | }, |
5050 | { | 5062 | { |
5051 | .name = "stat", | 5063 | .name = "stat", |
5052 | .seq_show = memcg_stat_show, | 5064 | .seq_show = memcg_stat_show, |
5053 | }, | 5065 | }, |
5054 | { | 5066 | { |
5055 | .name = "force_empty", | 5067 | .name = "force_empty", |
5056 | .write = mem_cgroup_force_empty_write, | 5068 | .write = mem_cgroup_force_empty_write, |
5057 | }, | 5069 | }, |
5058 | { | 5070 | { |
5059 | .name = "use_hierarchy", | 5071 | .name = "use_hierarchy", |
5060 | .write_u64 = mem_cgroup_hierarchy_write, | 5072 | .write_u64 = mem_cgroup_hierarchy_write, |
5061 | .read_u64 = mem_cgroup_hierarchy_read, | 5073 | .read_u64 = mem_cgroup_hierarchy_read, |
5062 | }, | 5074 | }, |
5063 | { | 5075 | { |
5064 | .name = "cgroup.event_control", /* XXX: for compat */ | 5076 | .name = "cgroup.event_control", /* XXX: for compat */ |
5065 | .write = memcg_write_event_control, | 5077 | .write = memcg_write_event_control, |
5066 | .flags = CFTYPE_NO_PREFIX, | 5078 | .flags = CFTYPE_NO_PREFIX, |
5067 | .mode = S_IWUGO, | 5079 | .mode = S_IWUGO, |
5068 | }, | 5080 | }, |
5069 | { | 5081 | { |
5070 | .name = "swappiness", | 5082 | .name = "swappiness", |
5071 | .read_u64 = mem_cgroup_swappiness_read, | 5083 | .read_u64 = mem_cgroup_swappiness_read, |
5072 | .write_u64 = mem_cgroup_swappiness_write, | 5084 | .write_u64 = mem_cgroup_swappiness_write, |
5073 | }, | 5085 | }, |
5074 | { | 5086 | { |
5075 | .name = "move_charge_at_immigrate", | 5087 | .name = "move_charge_at_immigrate", |
5076 | .read_u64 = mem_cgroup_move_charge_read, | 5088 | .read_u64 = mem_cgroup_move_charge_read, |
5077 | .write_u64 = mem_cgroup_move_charge_write, | 5089 | .write_u64 = mem_cgroup_move_charge_write, |
5078 | }, | 5090 | }, |
5079 | { | 5091 | { |
5080 | .name = "oom_control", | 5092 | .name = "oom_control", |
5081 | .seq_show = mem_cgroup_oom_control_read, | 5093 | .seq_show = mem_cgroup_oom_control_read, |
5082 | .write_u64 = mem_cgroup_oom_control_write, | 5094 | .write_u64 = mem_cgroup_oom_control_write, |
5083 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | 5095 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), |
5084 | }, | 5096 | }, |
5085 | { | 5097 | { |
5086 | .name = "pressure_level", | 5098 | .name = "pressure_level", |
5087 | }, | 5099 | }, |
5088 | #ifdef CONFIG_NUMA | 5100 | #ifdef CONFIG_NUMA |
5089 | { | 5101 | { |
5090 | .name = "numa_stat", | 5102 | .name = "numa_stat", |
5091 | .seq_show = memcg_numa_stat_show, | 5103 | .seq_show = memcg_numa_stat_show, |
5092 | }, | 5104 | }, |
5093 | #endif | 5105 | #endif |
5094 | #ifdef CONFIG_MEMCG_KMEM | 5106 | #ifdef CONFIG_MEMCG_KMEM |
5095 | { | 5107 | { |
5096 | .name = "kmem.limit_in_bytes", | 5108 | .name = "kmem.limit_in_bytes", |
5097 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | 5109 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), |
5098 | .write = mem_cgroup_write, | 5110 | .write = mem_cgroup_write, |
5099 | .read_u64 = mem_cgroup_read_u64, | 5111 | .read_u64 = mem_cgroup_read_u64, |
5100 | }, | 5112 | }, |
5101 | { | 5113 | { |
5102 | .name = "kmem.usage_in_bytes", | 5114 | .name = "kmem.usage_in_bytes", |
5103 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | 5115 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), |
5104 | .read_u64 = mem_cgroup_read_u64, | 5116 | .read_u64 = mem_cgroup_read_u64, |
5105 | }, | 5117 | }, |
5106 | { | 5118 | { |
5107 | .name = "kmem.failcnt", | 5119 | .name = "kmem.failcnt", |
5108 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | 5120 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), |
5109 | .write = mem_cgroup_reset, | 5121 | .write = mem_cgroup_reset, |
5110 | .read_u64 = mem_cgroup_read_u64, | 5122 | .read_u64 = mem_cgroup_read_u64, |
5111 | }, | 5123 | }, |
5112 | { | 5124 | { |
5113 | .name = "kmem.max_usage_in_bytes", | 5125 | .name = "kmem.max_usage_in_bytes", |
5114 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | 5126 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), |
5115 | .write = mem_cgroup_reset, | 5127 | .write = mem_cgroup_reset, |
5116 | .read_u64 = mem_cgroup_read_u64, | 5128 | .read_u64 = mem_cgroup_read_u64, |
5117 | }, | 5129 | }, |
5118 | #ifdef CONFIG_SLABINFO | 5130 | #ifdef CONFIG_SLABINFO |
5119 | { | 5131 | { |
5120 | .name = "kmem.slabinfo", | 5132 | .name = "kmem.slabinfo", |
5121 | .seq_show = mem_cgroup_slabinfo_read, | 5133 | .seq_show = mem_cgroup_slabinfo_read, |
5122 | }, | 5134 | }, |
5123 | #endif | 5135 | #endif |
5124 | #endif | 5136 | #endif |
5125 | { }, /* terminate */ | 5137 | { }, /* terminate */ |
5126 | }; | 5138 | }; |
5127 | 5139 | ||
5128 | #ifdef CONFIG_MEMCG_SWAP | 5140 | #ifdef CONFIG_MEMCG_SWAP |
5129 | static struct cftype memsw_cgroup_files[] = { | 5141 | static struct cftype memsw_cgroup_files[] = { |
5130 | { | 5142 | { |
5131 | .name = "memsw.usage_in_bytes", | 5143 | .name = "memsw.usage_in_bytes", |
5132 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 5144 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
5133 | .read_u64 = mem_cgroup_read_u64, | 5145 | .read_u64 = mem_cgroup_read_u64, |
5134 | }, | 5146 | }, |
5135 | { | 5147 | { |
5136 | .name = "memsw.max_usage_in_bytes", | 5148 | .name = "memsw.max_usage_in_bytes", |
5137 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), | 5149 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), |
5138 | .write = mem_cgroup_reset, | 5150 | .write = mem_cgroup_reset, |
5139 | .read_u64 = mem_cgroup_read_u64, | 5151 | .read_u64 = mem_cgroup_read_u64, |
5140 | }, | 5152 | }, |
5141 | { | 5153 | { |
5142 | .name = "memsw.limit_in_bytes", | 5154 | .name = "memsw.limit_in_bytes", |
5143 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), | 5155 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), |
5144 | .write = mem_cgroup_write, | 5156 | .write = mem_cgroup_write, |
5145 | .read_u64 = mem_cgroup_read_u64, | 5157 | .read_u64 = mem_cgroup_read_u64, |
5146 | }, | 5158 | }, |
5147 | { | 5159 | { |
5148 | .name = "memsw.failcnt", | 5160 | .name = "memsw.failcnt", |
5149 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), | 5161 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), |
5150 | .write = mem_cgroup_reset, | 5162 | .write = mem_cgroup_reset, |
5151 | .read_u64 = mem_cgroup_read_u64, | 5163 | .read_u64 = mem_cgroup_read_u64, |
5152 | }, | 5164 | }, |
5153 | { }, /* terminate */ | 5165 | { }, /* terminate */ |
5154 | }; | 5166 | }; |
5155 | #endif | 5167 | #endif |
5156 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 5168 | static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
5157 | { | 5169 | { |
5158 | struct mem_cgroup_per_node *pn; | 5170 | struct mem_cgroup_per_node *pn; |
5159 | struct mem_cgroup_per_zone *mz; | 5171 | struct mem_cgroup_per_zone *mz; |
5160 | int zone, tmp = node; | 5172 | int zone, tmp = node; |
5161 | /* | 5173 | /* |
5162 | * This routine is called against possible nodes. | 5174 | * This routine is called against possible nodes. |
5163 | * But it's a BUG to call kmalloc() against an offline node. | 5175 | * But it's a BUG to call kmalloc() against an offline node. |
5164 | * | 5176 | * |
5165 | * TODO: this routine can waste much memory for nodes which will | 5177 | * TODO: this routine can waste much memory for nodes which will |
5166 | * never be onlined. It's better to use memory hotplug callback | 5178 | * never be onlined. It's better to use memory hotplug callback |
5167 | * function. | 5179 | * function. |
5168 | */ | 5180 | */ |
5169 | if (!node_state(node, N_NORMAL_MEMORY)) | 5181 | if (!node_state(node, N_NORMAL_MEMORY)) |
5170 | tmp = -1; | 5182 | tmp = -1; |
5171 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); | 5183 | pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); |
5172 | if (!pn) | 5184 | if (!pn) |
5173 | return 1; | 5185 | return 1; |
5174 | 5186 | ||
5175 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5187 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
5176 | mz = &pn->zoneinfo[zone]; | 5188 | mz = &pn->zoneinfo[zone]; |
5177 | lruvec_init(&mz->lruvec); | 5189 | lruvec_init(&mz->lruvec); |
5178 | mz->usage_in_excess = 0; | 5190 | mz->usage_in_excess = 0; |
5179 | mz->on_tree = false; | 5191 | mz->on_tree = false; |
5180 | mz->memcg = memcg; | 5192 | mz->memcg = memcg; |
5181 | } | 5193 | } |
5182 | memcg->nodeinfo[node] = pn; | 5194 | memcg->nodeinfo[node] = pn; |
5183 | return 0; | 5195 | return 0; |
5184 | } | 5196 | } |
5185 | 5197 | ||
5186 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) | 5198 | static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) |
5187 | { | 5199 | { |
5188 | kfree(memcg->nodeinfo[node]); | 5200 | kfree(memcg->nodeinfo[node]); |
5189 | } | 5201 | } |
5190 | 5202 | ||
5191 | static struct mem_cgroup *mem_cgroup_alloc(void) | 5203 | static struct mem_cgroup *mem_cgroup_alloc(void) |
5192 | { | 5204 | { |
5193 | struct mem_cgroup *memcg; | 5205 | struct mem_cgroup *memcg; |
5194 | size_t size; | 5206 | size_t size; |
5195 | 5207 | ||
5196 | size = sizeof(struct mem_cgroup); | 5208 | size = sizeof(struct mem_cgroup); |
5197 | size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); | 5209 | size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); |
5198 | 5210 | ||
5199 | memcg = kzalloc(size, GFP_KERNEL); | 5211 | memcg = kzalloc(size, GFP_KERNEL); |
5200 | if (!memcg) | 5212 | if (!memcg) |
5201 | return NULL; | 5213 | return NULL; |
5202 | 5214 | ||
5203 | memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | 5215 | memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); |
5204 | if (!memcg->stat) | 5216 | if (!memcg->stat) |
5205 | goto out_free; | 5217 | goto out_free; |
5206 | spin_lock_init(&memcg->pcp_counter_lock); | 5218 | spin_lock_init(&memcg->pcp_counter_lock); |
5207 | return memcg; | 5219 | return memcg; |
5208 | 5220 | ||
5209 | out_free: | 5221 | out_free: |
5210 | kfree(memcg); | 5222 | kfree(memcg); |
5211 | return NULL; | 5223 | return NULL; |
5212 | } | 5224 | } |
5213 | 5225 | ||
5214 | /* | 5226 | /* |
5215 | * When destroying a mem_cgroup, references from swap_cgroup can remain. | 5227 | * When destroying a mem_cgroup, references from swap_cgroup can remain. |
5216 | * (scanning all at force_empty is too costly...) | 5228 | * (scanning all at force_empty is too costly...) |
5217 | * | 5229 | * |
5218 | * Instead of clearing all references at force_empty, we remember | 5230 | * Instead of clearing all references at force_empty, we remember |
5219 | * the number of references from swap_cgroup and free the mem_cgroup when | 5231 | * the number of references from swap_cgroup and free the mem_cgroup when |
5220 | * it goes down to 0. | 5232 | * it goes down to 0. |
5221 | * | 5233 | * |
5222 | * Removal of cgroup itself succeeds regardless of refs from swap. | 5234 | * Removal of cgroup itself succeeds regardless of refs from swap. |
5223 | */ | 5235 | */ |
5224 | 5236 | ||
5225 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | 5237 | static void __mem_cgroup_free(struct mem_cgroup *memcg) |
5226 | { | 5238 | { |
5227 | int node; | 5239 | int node; |
5228 | 5240 | ||
5229 | mem_cgroup_remove_from_trees(memcg); | 5241 | mem_cgroup_remove_from_trees(memcg); |
5230 | 5242 | ||
5231 | for_each_node(node) | 5243 | for_each_node(node) |
5232 | free_mem_cgroup_per_zone_info(memcg, node); | 5244 | free_mem_cgroup_per_zone_info(memcg, node); |
5233 | 5245 | ||
5234 | free_percpu(memcg->stat); | 5246 | free_percpu(memcg->stat); |
5235 | 5247 | ||
5236 | /* | 5248 | /* |
5237 | * We need to make sure that (at least for now), the jump label | 5249 | * We need to make sure that (at least for now), the jump label |
5238 | * destruction code runs outside of the cgroup lock. This is because | 5250 | * destruction code runs outside of the cgroup lock. This is because |
5239 | * get_online_cpus(), which is called from the static_branch update, | 5251 | * get_online_cpus(), which is called from the static_branch update, |
5240 | * can't be called inside the cgroup_lock. cpusets are the ones | 5252 | * can't be called inside the cgroup_lock. cpusets are the ones |
5241 | * enforcing this dependency, so if they ever change, we might as well. | 5253 | * enforcing this dependency, so if they ever change, we might as well. |
5242 | * | 5254 | * |
5243 | * schedule_work() will guarantee this happens. Be careful if you need | 5255 | * schedule_work() will guarantee this happens. Be careful if you need |
5244 | * to move this code around, and make sure it is outside | 5256 | * to move this code around, and make sure it is outside |
5245 | * the cgroup_lock. | 5257 | * the cgroup_lock. |
5246 | */ | 5258 | */ |
5247 | disarm_static_keys(memcg); | 5259 | disarm_static_keys(memcg); |
5248 | kfree(memcg); | 5260 | kfree(memcg); |
5249 | } | 5261 | } |
5250 | 5262 | ||
5251 | /* | 5263 | /* |
5252 | * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy enabled. | 5264 | * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy enabled. |
5253 | */ | 5265 | */ |
5254 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | 5266 | struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) |
5255 | { | 5267 | { |
5256 | if (!memcg->memory.parent) | 5268 | if (!memcg->memory.parent) |
5257 | return NULL; | 5269 | return NULL; |
5258 | return mem_cgroup_from_counter(memcg->memory.parent, memory); | 5270 | return mem_cgroup_from_counter(memcg->memory.parent, memory); |
5259 | } | 5271 | } |
5260 | EXPORT_SYMBOL(parent_mem_cgroup); | 5272 | EXPORT_SYMBOL(parent_mem_cgroup); |
5261 | 5273 | ||
5262 | static void __init mem_cgroup_soft_limit_tree_init(void) | 5274 | static void __init mem_cgroup_soft_limit_tree_init(void) |
5263 | { | 5275 | { |
5264 | struct mem_cgroup_tree_per_node *rtpn; | 5276 | struct mem_cgroup_tree_per_node *rtpn; |
5265 | struct mem_cgroup_tree_per_zone *rtpz; | 5277 | struct mem_cgroup_tree_per_zone *rtpz; |
5266 | int tmp, node, zone; | 5278 | int tmp, node, zone; |
5267 | 5279 | ||
5268 | for_each_node(node) { | 5280 | for_each_node(node) { |
5269 | tmp = node; | 5281 | tmp = node; |
5270 | if (!node_state(node, N_NORMAL_MEMORY)) | 5282 | if (!node_state(node, N_NORMAL_MEMORY)) |
5271 | tmp = -1; | 5283 | tmp = -1; |
5272 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | 5284 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); |
5273 | BUG_ON(!rtpn); | 5285 | BUG_ON(!rtpn); |
5274 | 5286 | ||
5275 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | 5287 | soft_limit_tree.rb_tree_per_node[node] = rtpn; |
5276 | 5288 | ||
5277 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | 5289 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { |
5278 | rtpz = &rtpn->rb_tree_per_zone[zone]; | 5290 | rtpz = &rtpn->rb_tree_per_zone[zone]; |
5279 | rtpz->rb_root = RB_ROOT; | 5291 | rtpz->rb_root = RB_ROOT; |
5280 | spin_lock_init(&rtpz->lock); | 5292 | spin_lock_init(&rtpz->lock); |
5281 | } | 5293 | } |
5282 | } | 5294 | } |
5283 | } | 5295 | } |
5284 | 5296 | ||
5285 | static struct cgroup_subsys_state * __ref | 5297 | static struct cgroup_subsys_state * __ref |
5286 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) | 5298 | mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) |
5287 | { | 5299 | { |
5288 | struct mem_cgroup *memcg; | 5300 | struct mem_cgroup *memcg; |
5289 | long error = -ENOMEM; | 5301 | long error = -ENOMEM; |
5290 | int node; | 5302 | int node; |
5291 | 5303 | ||
5292 | memcg = mem_cgroup_alloc(); | 5304 | memcg = mem_cgroup_alloc(); |
5293 | if (!memcg) | 5305 | if (!memcg) |
5294 | return ERR_PTR(error); | 5306 | return ERR_PTR(error); |
5295 | 5307 | ||
5296 | for_each_node(node) | 5308 | for_each_node(node) |
5297 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) | 5309 | if (alloc_mem_cgroup_per_zone_info(memcg, node)) |
5298 | goto free_out; | 5310 | goto free_out; |
5299 | 5311 | ||
5300 | /* root ? */ | 5312 | /* root ? */ |
5301 | if (parent_css == NULL) { | 5313 | if (parent_css == NULL) { |
5302 | root_mem_cgroup = memcg; | 5314 | root_mem_cgroup = memcg; |
5303 | page_counter_init(&memcg->memory, NULL); | 5315 | page_counter_init(&memcg->memory, NULL); |
5304 | page_counter_init(&memcg->memsw, NULL); | 5316 | page_counter_init(&memcg->memsw, NULL); |
5305 | page_counter_init(&memcg->kmem, NULL); | 5317 | page_counter_init(&memcg->kmem, NULL); |
5306 | } | 5318 | } |
5307 | 5319 | ||
5308 | memcg->last_scanned_node = MAX_NUMNODES; | 5320 | memcg->last_scanned_node = MAX_NUMNODES; |
5309 | INIT_LIST_HEAD(&memcg->oom_notify); | 5321 | INIT_LIST_HEAD(&memcg->oom_notify); |
5310 | memcg->move_charge_at_immigrate = 0; | 5322 | memcg->move_charge_at_immigrate = 0; |
5311 | mutex_init(&memcg->thresholds_lock); | 5323 | mutex_init(&memcg->thresholds_lock); |
5312 | spin_lock_init(&memcg->move_lock); | 5324 | spin_lock_init(&memcg->move_lock); |
5313 | vmpressure_init(&memcg->vmpressure); | 5325 | vmpressure_init(&memcg->vmpressure); |
5314 | INIT_LIST_HEAD(&memcg->event_list); | 5326 | INIT_LIST_HEAD(&memcg->event_list); |
5315 | spin_lock_init(&memcg->event_list_lock); | 5327 | spin_lock_init(&memcg->event_list_lock); |
5316 | 5328 | ||
5317 | return &memcg->css; | 5329 | return &memcg->css; |
5318 | 5330 | ||
5319 | free_out: | 5331 | free_out: |
5320 | __mem_cgroup_free(memcg); | 5332 | __mem_cgroup_free(memcg); |
5321 | return ERR_PTR(error); | 5333 | return ERR_PTR(error); |
5322 | } | 5334 | } |
5323 | 5335 | ||
5324 | static int | 5336 | static int |
5325 | mem_cgroup_css_online(struct cgroup_subsys_state *css) | 5337 | mem_cgroup_css_online(struct cgroup_subsys_state *css) |
5326 | { | 5338 | { |
5327 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5339 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5328 | struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); | 5340 | struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); |
5329 | int ret; | 5341 | int ret; |
5330 | 5342 | ||
5331 | if (css->id > MEM_CGROUP_ID_MAX) | 5343 | if (css->id > MEM_CGROUP_ID_MAX) |
5332 | return -ENOSPC; | 5344 | return -ENOSPC; |
5333 | 5345 | ||
5334 | if (!parent) | 5346 | if (!parent) |
5335 | return 0; | 5347 | return 0; |
5336 | 5348 | ||
5337 | mutex_lock(&memcg_create_mutex); | 5349 | mutex_lock(&memcg_create_mutex); |
5338 | 5350 | ||
5339 | memcg->use_hierarchy = parent->use_hierarchy; | 5351 | memcg->use_hierarchy = parent->use_hierarchy; |
5340 | memcg->oom_kill_disable = parent->oom_kill_disable; | 5352 | memcg->oom_kill_disable = parent->oom_kill_disable; |
5341 | memcg->swappiness = mem_cgroup_swappiness(parent); | 5353 | memcg->swappiness = mem_cgroup_swappiness(parent); |
5342 | 5354 | ||
5343 | if (parent->use_hierarchy) { | 5355 | if (parent->use_hierarchy) { |
5344 | page_counter_init(&memcg->memory, &parent->memory); | 5356 | page_counter_init(&memcg->memory, &parent->memory); |
5345 | page_counter_init(&memcg->memsw, &parent->memsw); | 5357 | page_counter_init(&memcg->memsw, &parent->memsw); |
5346 | page_counter_init(&memcg->kmem, &parent->kmem); | 5358 | page_counter_init(&memcg->kmem, &parent->kmem); |
5347 | 5359 | ||
5348 | /* | 5360 | /* |
5349 | * No need to take a reference to the parent because cgroup | 5361 | * No need to take a reference to the parent because cgroup |
5350 | * core guarantees its existence. | 5362 | * core guarantees its existence. |
5351 | */ | 5363 | */ |
5352 | } else { | 5364 | } else { |
5353 | page_counter_init(&memcg->memory, NULL); | 5365 | page_counter_init(&memcg->memory, NULL); |
5354 | page_counter_init(&memcg->memsw, NULL); | 5366 | page_counter_init(&memcg->memsw, NULL); |
5355 | page_counter_init(&memcg->kmem, NULL); | 5367 | page_counter_init(&memcg->kmem, NULL); |
5356 | /* | 5368 | /* |
5357 | * Deeper hierarchy with use_hierarchy == false doesn't make | 5369 | * Deeper hierarchy with use_hierarchy == false doesn't make |
5358 | * much sense so let cgroup subsystem know about this | 5370 | * much sense so let cgroup subsystem know about this |
5359 | * unfortunate state in our controller. | 5371 | * unfortunate state in our controller. |
5360 | */ | 5372 | */ |
5361 | if (parent != root_mem_cgroup) | 5373 | if (parent != root_mem_cgroup) |
5362 | memory_cgrp_subsys.broken_hierarchy = true; | 5374 | memory_cgrp_subsys.broken_hierarchy = true; |
5363 | } | 5375 | } |
5364 | mutex_unlock(&memcg_create_mutex); | 5376 | mutex_unlock(&memcg_create_mutex); |
5365 | 5377 | ||
5366 | ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); | 5378 | ret = memcg_init_kmem(memcg, &memory_cgrp_subsys); |
5367 | if (ret) | 5379 | if (ret) |
5368 | return ret; | 5380 | return ret; |
5369 | 5381 | ||
5370 | /* | 5382 | /* |
5371 | * Make sure the memcg is initialized: mem_cgroup_iter() | 5383 | * Make sure the memcg is initialized: mem_cgroup_iter() |
5372 | * orders reading memcg->initialized against its callers | 5384 | * orders reading memcg->initialized against its callers |
5373 | * reading the memcg members. | 5385 | * reading the memcg members. |
5374 | */ | 5386 | */ |
5375 | smp_store_release(&memcg->initialized, 1); | 5387 | smp_store_release(&memcg->initialized, 1); |
5376 | 5388 | ||
5377 | return 0; | 5389 | return 0; |
5378 | } | 5390 | } |
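/*
 * Illustrative sketch, not part of the commit: the smp_store_release() above
 * publishes a fully initialized memcg.  It pairs with an acquire load on the
 * reader side, as mem_cgroup_iter() does, so a reader that observes
 * ->initialized == 1 also observes every field written before the store.
 * The helper name below is hypothetical.
 */
static struct mem_cgroup *example_use_if_initialized(struct mem_cgroup *memcg)
{
	/* Acquire pairs with the release store in mem_cgroup_css_online(). */
	if (!smp_load_acquire(&memcg->initialized))
		return NULL;		/* not published yet, skip it */
	return memcg;			/* memcg members are now safe to read */
}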
5379 | 5391 | ||
5380 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) | 5392 | static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) |
5381 | { | 5393 | { |
5382 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5394 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5383 | struct mem_cgroup_event *event, *tmp; | 5395 | struct mem_cgroup_event *event, *tmp; |
5384 | struct cgroup_subsys_state *iter; | 5396 | struct cgroup_subsys_state *iter; |
5385 | 5397 | ||
5386 | /* | 5398 | /* |
5387 | * Unregister events and notify userspace. | 5399 | * Unregister events and notify userspace. |
5388 | * Notify userspace about cgroup removing only after rmdir of cgroup | 5400 | * Notify userspace about cgroup removing only after rmdir of cgroup |
5389 | * directory to avoid race between userspace and kernelspace. | 5401 | * directory to avoid race between userspace and kernelspace. |
5390 | */ | 5402 | */ |
5391 | spin_lock(&memcg->event_list_lock); | 5403 | spin_lock(&memcg->event_list_lock); |
5392 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { | 5404 | list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { |
5393 | list_del_init(&event->list); | 5405 | list_del_init(&event->list); |
5394 | schedule_work(&event->remove); | 5406 | schedule_work(&event->remove); |
5395 | } | 5407 | } |
5396 | spin_unlock(&memcg->event_list_lock); | 5408 | spin_unlock(&memcg->event_list_lock); |
5397 | 5409 | ||
5398 | kmem_cgroup_css_offline(memcg); | 5410 | kmem_cgroup_css_offline(memcg); |
5399 | 5411 | ||
5400 | /* | 5412 | /* |
5401 | * This requires that offlining is serialized. Right now that is | 5413 | * This requires that offlining is serialized. Right now that is |
5402 | * guaranteed because css_killed_work_fn() holds the cgroup_mutex. | 5414 | * guaranteed because css_killed_work_fn() holds the cgroup_mutex. |
5403 | */ | 5415 | */ |
5404 | css_for_each_descendant_post(iter, css) | 5416 | css_for_each_descendant_post(iter, css) |
5405 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); | 5417 | mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); |
5406 | 5418 | ||
5407 | memcg_unregister_all_caches(memcg); | 5419 | memcg_unregister_all_caches(memcg); |
5408 | vmpressure_cleanup(&memcg->vmpressure); | 5420 | vmpressure_cleanup(&memcg->vmpressure); |
5409 | } | 5421 | } |
5410 | 5422 | ||
5411 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) | 5423 | static void mem_cgroup_css_free(struct cgroup_subsys_state *css) |
5412 | { | 5424 | { |
5413 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5425 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5414 | /* | 5426 | /* |
5415 | * XXX: css_offline() would be where we should reparent all | 5427 | * XXX: css_offline() would be where we should reparent all |
5416 | * memory to prepare the cgroup for destruction. However, | 5428 | * memory to prepare the cgroup for destruction. However, |
5417 | * memcg does not do css_tryget_online() and page_counter charging | 5429 | * memcg does not do css_tryget_online() and page_counter charging |
5418 | * under the same RCU lock region, which means that charging | 5430 | * under the same RCU lock region, which means that charging |
5419 | * could race with offlining. Offlining only happens to | 5431 | * could race with offlining. Offlining only happens to |
5420 | * cgroups with no tasks in them but charges can show up | 5432 | * cgroups with no tasks in them but charges can show up |
5421 | * without any tasks from the swapin path when the target | 5433 | * without any tasks from the swapin path when the target |
5422 | * memcg is looked up from the swapout record and not from the | 5434 | * memcg is looked up from the swapout record and not from the |
5423 | * current task as it usually is. A race like this can leak | 5435 | * current task as it usually is. A race like this can leak |
5424 | * charges and put pages with stale cgroup pointers into | 5436 | * charges and put pages with stale cgroup pointers into |
5425 | * circulation: | 5437 | * circulation: |
5426 | * | 5438 | * |
5427 | * #0 #1 | 5439 | * #0 #1 |
5428 | * lookup_swap_cgroup_id() | 5440 | * lookup_swap_cgroup_id() |
5429 | * rcu_read_lock() | 5441 | * rcu_read_lock() |
5430 | * mem_cgroup_lookup() | 5442 | * mem_cgroup_lookup() |
5431 | * css_tryget_online() | 5443 | * css_tryget_online() |
5432 | * rcu_read_unlock() | 5444 | * rcu_read_unlock() |
5433 | * disable css_tryget_online() | 5445 | * disable css_tryget_online() |
5434 | * call_rcu() | 5446 | * call_rcu() |
5435 | * offline_css() | 5447 | * offline_css() |
5436 | * reparent_charges() | 5448 | * reparent_charges() |
5437 | * page_counter_try_charge() | 5449 | * page_counter_try_charge() |
5438 | * css_put() | 5450 | * css_put() |
5439 | * css_free() | 5451 | * css_free() |
5440 | * pc->mem_cgroup = dead memcg | 5452 | * pc->mem_cgroup = dead memcg |
5441 | * add page to lru | 5453 | * add page to lru |
5442 | * | 5454 | * |
5443 | * The bulk of the charges are still moved in offline_css() to | 5455 | * The bulk of the charges are still moved in offline_css() to |
5444 | * avoid pinning a lot of pages in case a long-term reference | 5456 | * avoid pinning a lot of pages in case a long-term reference |
5445 | * like a swapout record is deferring the css_free() to long | 5457 | * like a swapout record is deferring the css_free() to long |
5446 | * after offlining. But this makes sure we catch any charges | 5458 | * after offlining. But this makes sure we catch any charges |
5447 | * made after offlining: | 5459 | * made after offlining: |
5448 | */ | 5460 | */ |
5449 | mem_cgroup_reparent_charges(memcg); | 5461 | mem_cgroup_reparent_charges(memcg); |
5450 | 5462 | ||
5451 | memcg_destroy_kmem(memcg); | 5463 | memcg_destroy_kmem(memcg); |
5452 | __mem_cgroup_free(memcg); | 5464 | __mem_cgroup_free(memcg); |
5453 | } | 5465 | } |
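/*
 * Illustrative sketch, not part of the commit: the "#0" column in the race
 * diagram above is the swapin charge path.  It looks the memcg up from the
 * swap record and pins it with css_tryget_online() under RCU, but issues the
 * page_counter charge only after the RCU section ends, which is why a
 * concurrent offline can slip in between.  The helper name is hypothetical;
 * the body mirrors try_get_mem_cgroup_from_page().
 */
static struct mem_cgroup *example_lookup_swapped_memcg(swp_entry_t entry)
{
	unsigned short id = lookup_swap_cgroup_id(entry);
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_lookup(id);
	if (memcg && !css_tryget_online(&memcg->css))
		memcg = NULL;			/* already offlined */
	rcu_read_unlock();

	return memcg;				/* charge happens later, outside RCU */
}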
5454 | 5466 | ||
5455 | /** | 5467 | /** |
5456 | * mem_cgroup_css_reset - reset the states of a mem_cgroup | 5468 | * mem_cgroup_css_reset - reset the states of a mem_cgroup |
5457 | * @css: the target css | 5469 | * @css: the target css |
5458 | * | 5470 | * |
5459 | * Reset the states of the mem_cgroup associated with @css. This is | 5471 | * Reset the states of the mem_cgroup associated with @css. This is |
5460 | * invoked when the userland requests disabling on the default hierarchy | 5472 | * invoked when the userland requests disabling on the default hierarchy |
5461 | * but the memcg is pinned through dependency. The memcg should stop | 5473 | * but the memcg is pinned through dependency. The memcg should stop |
5462 | * applying policies and should revert to the vanilla state as it may be | 5474 | * applying policies and should revert to the vanilla state as it may be |
5463 | * made visible again. | 5475 | * made visible again. |
5464 | * | 5476 | * |
5465 | * The current implementation only resets the essential configurations. | 5477 | * The current implementation only resets the essential configurations. |
5466 | * This needs to be expanded to cover all the visible parts. | 5478 | * This needs to be expanded to cover all the visible parts. |
5467 | */ | 5479 | */ |
5468 | static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) | 5480 | static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) |
5469 | { | 5481 | { |
5470 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5482 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5471 | 5483 | ||
5472 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); | 5484 | mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); |
5473 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); | 5485 | mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); |
5474 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); | 5486 | memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); |
5475 | memcg->soft_limit = 0; | 5487 | memcg->soft_limit = 0; |
5476 | } | 5488 | } |
5477 | 5489 | ||
5478 | #ifdef CONFIG_MMU | 5490 | #ifdef CONFIG_MMU |
5479 | /* Handlers for move charge at task migration. */ | 5491 | /* Handlers for move charge at task migration. */ |
5480 | static int mem_cgroup_do_precharge(unsigned long count) | 5492 | static int mem_cgroup_do_precharge(unsigned long count) |
5481 | { | 5493 | { |
5482 | int ret; | 5494 | int ret; |
5483 | 5495 | ||
5484 | /* Try a single bulk charge without reclaim first */ | 5496 | /* Try a single bulk charge without reclaim first */ |
5485 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); | 5497 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); |
5486 | if (!ret) { | 5498 | if (!ret) { |
5487 | mc.precharge += count; | 5499 | mc.precharge += count; |
5488 | return ret; | 5500 | return ret; |
5489 | } | 5501 | } |
5490 | if (ret == -EINTR) { | 5502 | if (ret == -EINTR) { |
5491 | cancel_charge(root_mem_cgroup, count); | 5503 | cancel_charge(root_mem_cgroup, count); |
5492 | return ret; | 5504 | return ret; |
5493 | } | 5505 | } |
5494 | 5506 | ||
5495 | /* Try charges one by one with reclaim */ | 5507 | /* Try charges one by one with reclaim */ |
5496 | while (count--) { | 5508 | while (count--) { |
5497 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); | 5509 | ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); |
5498 | /* | 5510 | /* |
5499 | * In case of failure, any residual charges against | 5511 | * In case of failure, any residual charges against |
5500 | * mc.to will be dropped by mem_cgroup_clear_mc() | 5512 | * mc.to will be dropped by mem_cgroup_clear_mc() |
5501 | * later on. However, cancel any charges that are | 5513 | * later on. However, cancel any charges that are |
5502 | * bypassed to root right away or they'll be lost. | 5514 | * bypassed to root right away or they'll be lost. |
5503 | */ | 5515 | */ |
5504 | if (ret == -EINTR) | 5516 | if (ret == -EINTR) |
5505 | cancel_charge(root_mem_cgroup, 1); | 5517 | cancel_charge(root_mem_cgroup, 1); |
5506 | if (ret) | 5518 | if (ret) |
5507 | return ret; | 5519 | return ret; |
5508 | mc.precharge++; | 5520 | mc.precharge++; |
5509 | cond_resched(); | 5521 | cond_resched(); |
5510 | } | 5522 | } |
5511 | return 0; | 5523 | return 0; |
5512 | } | 5524 | } |
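/*
 * Illustrative sketch, not part of the commit: mem_cgroup_do_precharge()
 * follows an "optimistic bulk, then one-by-one" shape.  The first attempt
 * masks out __GFP_WAIT so it cannot block in reclaim; the fallback charges a
 * single page at a time and may reclaim.  A generic version of the pattern,
 * with the two attempts passed in as hypothetical callbacks:
 */
static int example_bulk_then_single(unsigned long count,
				    int (*try_bulk)(unsigned long),
				    int (*try_one)(void))
{
	if (!try_bulk(count))			/* cheap attempt, no reclaim */
		return 0;

	while (count--) {			/* fall back to single items */
		int ret = try_one();		/* may reclaim and block */

		if (ret)
			return ret;		/* partial progress is undone by the caller */
		cond_resched();
	}
	return 0;
}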
5513 | 5525 | ||
5514 | /** | 5526 | /** |
5515 | * get_mctgt_type - get target type of moving charge | 5527 | * get_mctgt_type - get target type of moving charge |
5516 | * @vma: the vma the pte to be checked belongs to | 5528 | * @vma: the vma the pte to be checked belongs to |
5517 | * @addr: the address corresponding to the pte to be checked | 5529 | * @addr: the address corresponding to the pte to be checked |
5518 | * @ptent: the pte to be checked | 5530 | * @ptent: the pte to be checked |
5519 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) | 5531 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) |
5520 | * | 5532 | * |
5521 | * Returns | 5533 | * Returns |
5522 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | 5534 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. |
5523 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | 5535 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for |
5524 | * move charge. if @target is not NULL, the page is stored in target->page | 5536 | * move charge. if @target is not NULL, the page is stored in target->page |
5525 | * with an extra refcount taken (callers should handle it). | 5537 | * with an extra refcount taken (callers should handle it). |
5526 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | 5538 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a |
5527 | * target for charge migration. if @target is not NULL, the entry is stored | 5539 | * target for charge migration. if @target is not NULL, the entry is stored |
5528 | * in target->ent. | 5540 | * in target->ent. |
5529 | * | 5541 | * |
5530 | * Called with pte lock held. | 5542 | * Called with pte lock held. |
5531 | */ | 5543 | */ |
5532 | union mc_target { | 5544 | union mc_target { |
5533 | struct page *page; | 5545 | struct page *page; |
5534 | swp_entry_t ent; | 5546 | swp_entry_t ent; |
5535 | }; | 5547 | }; |
5536 | 5548 | ||
5537 | enum mc_target_type { | 5549 | enum mc_target_type { |
5538 | MC_TARGET_NONE = 0, | 5550 | MC_TARGET_NONE = 0, |
5539 | MC_TARGET_PAGE, | 5551 | MC_TARGET_PAGE, |
5540 | MC_TARGET_SWAP, | 5552 | MC_TARGET_SWAP, |
5541 | }; | 5553 | }; |
5542 | 5554 | ||
5543 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | 5555 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, |
5544 | unsigned long addr, pte_t ptent) | 5556 | unsigned long addr, pte_t ptent) |
5545 | { | 5557 | { |
5546 | struct page *page = vm_normal_page(vma, addr, ptent); | 5558 | struct page *page = vm_normal_page(vma, addr, ptent); |
5547 | 5559 | ||
5548 | if (!page || !page_mapped(page)) | 5560 | if (!page || !page_mapped(page)) |
5549 | return NULL; | 5561 | return NULL; |
5550 | if (PageAnon(page)) { | 5562 | if (PageAnon(page)) { |
5551 | /* we don't move shared anon */ | 5563 | /* we don't move shared anon */ |
5552 | if (!move_anon()) | 5564 | if (!move_anon()) |
5553 | return NULL; | 5565 | return NULL; |
5554 | } else if (!move_file()) | 5566 | } else if (!move_file()) |
5555 | /* we ignore mapcount for file pages */ | 5567 | /* we ignore mapcount for file pages */ |
5556 | return NULL; | 5568 | return NULL; |
5557 | if (!get_page_unless_zero(page)) | 5569 | if (!get_page_unless_zero(page)) |
5558 | return NULL; | 5570 | return NULL; |
5559 | 5571 | ||
5560 | return page; | 5572 | return page; |
5561 | } | 5573 | } |
5562 | 5574 | ||
5563 | #ifdef CONFIG_SWAP | 5575 | #ifdef CONFIG_SWAP |
5564 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 5576 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
5565 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5577 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5566 | { | 5578 | { |
5567 | struct page *page = NULL; | 5579 | struct page *page = NULL; |
5568 | swp_entry_t ent = pte_to_swp_entry(ptent); | 5580 | swp_entry_t ent = pte_to_swp_entry(ptent); |
5569 | 5581 | ||
5570 | if (!move_anon() || non_swap_entry(ent)) | 5582 | if (!move_anon() || non_swap_entry(ent)) |
5571 | return NULL; | 5583 | return NULL; |
5572 | /* | 5584 | /* |
5573 | * Because lookup_swap_cache() updates some statistics counters, | 5585 | * Because lookup_swap_cache() updates some statistics counters, |
5574 | * we call find_get_page() with swapper_space directly. | 5586 | * we call find_get_page() with swapper_space directly. |
5575 | */ | 5587 | */ |
5576 | page = find_get_page(swap_address_space(ent), ent.val); | 5588 | page = find_get_page(swap_address_space(ent), ent.val); |
5577 | if (do_swap_account) | 5589 | if (do_swap_account) |
5578 | entry->val = ent.val; | 5590 | entry->val = ent.val; |
5579 | 5591 | ||
5580 | return page; | 5592 | return page; |
5581 | } | 5593 | } |
5582 | #else | 5594 | #else |
5583 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | 5595 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, |
5584 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5596 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5585 | { | 5597 | { |
5586 | return NULL; | 5598 | return NULL; |
5587 | } | 5599 | } |
5588 | #endif | 5600 | #endif |
5589 | 5601 | ||
5590 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | 5602 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, |
5591 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | 5603 | unsigned long addr, pte_t ptent, swp_entry_t *entry) |
5592 | { | 5604 | { |
5593 | struct page *page = NULL; | 5605 | struct page *page = NULL; |
5594 | struct address_space *mapping; | 5606 | struct address_space *mapping; |
5595 | pgoff_t pgoff; | 5607 | pgoff_t pgoff; |
5596 | 5608 | ||
5597 | if (!vma->vm_file) /* anonymous vma */ | 5609 | if (!vma->vm_file) /* anonymous vma */ |
5598 | return NULL; | 5610 | return NULL; |
5599 | if (!move_file()) | 5611 | if (!move_file()) |
5600 | return NULL; | 5612 | return NULL; |
5601 | 5613 | ||
5602 | mapping = vma->vm_file->f_mapping; | 5614 | mapping = vma->vm_file->f_mapping; |
5603 | if (pte_none(ptent)) | 5615 | if (pte_none(ptent)) |
5604 | pgoff = linear_page_index(vma, addr); | 5616 | pgoff = linear_page_index(vma, addr); |
5605 | else /* pte_file(ptent) is true */ | 5617 | else /* pte_file(ptent) is true */ |
5606 | pgoff = pte_to_pgoff(ptent); | 5618 | pgoff = pte_to_pgoff(ptent); |
5607 | 5619 | ||
5608 | /* page is moved even if it's not RSS of this task(page-faulted). */ | 5620 | /* page is moved even if it's not RSS of this task(page-faulted). */ |
5609 | #ifdef CONFIG_SWAP | 5621 | #ifdef CONFIG_SWAP |
5610 | /* shmem/tmpfs may report page out on swap: account for that too. */ | 5622 | /* shmem/tmpfs may report page out on swap: account for that too. */ |
5611 | if (shmem_mapping(mapping)) { | 5623 | if (shmem_mapping(mapping)) { |
5612 | page = find_get_entry(mapping, pgoff); | 5624 | page = find_get_entry(mapping, pgoff); |
5613 | if (radix_tree_exceptional_entry(page)) { | 5625 | if (radix_tree_exceptional_entry(page)) { |
5614 | swp_entry_t swp = radix_to_swp_entry(page); | 5626 | swp_entry_t swp = radix_to_swp_entry(page); |
5615 | if (do_swap_account) | 5627 | if (do_swap_account) |
5616 | *entry = swp; | 5628 | *entry = swp; |
5617 | page = find_get_page(swap_address_space(swp), swp.val); | 5629 | page = find_get_page(swap_address_space(swp), swp.val); |
5618 | } | 5630 | } |
5619 | } else | 5631 | } else |
5620 | page = find_get_page(mapping, pgoff); | 5632 | page = find_get_page(mapping, pgoff); |
5621 | #else | 5633 | #else |
5622 | page = find_get_page(mapping, pgoff); | 5634 | page = find_get_page(mapping, pgoff); |
5623 | #endif | 5635 | #endif |
5624 | return page; | 5636 | return page; |
5625 | } | 5637 | } |
5626 | 5638 | ||
5627 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | 5639 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
5628 | unsigned long addr, pte_t ptent, union mc_target *target) | 5640 | unsigned long addr, pte_t ptent, union mc_target *target) |
5629 | { | 5641 | { |
5630 | struct page *page = NULL; | 5642 | struct page *page = NULL; |
5631 | struct page_cgroup *pc; | 5643 | struct page_cgroup *pc; |
5632 | enum mc_target_type ret = MC_TARGET_NONE; | 5644 | enum mc_target_type ret = MC_TARGET_NONE; |
5633 | swp_entry_t ent = { .val = 0 }; | 5645 | swp_entry_t ent = { .val = 0 }; |
5634 | 5646 | ||
5635 | if (pte_present(ptent)) | 5647 | if (pte_present(ptent)) |
5636 | page = mc_handle_present_pte(vma, addr, ptent); | 5648 | page = mc_handle_present_pte(vma, addr, ptent); |
5637 | else if (is_swap_pte(ptent)) | 5649 | else if (is_swap_pte(ptent)) |
5638 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); | 5650 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
5639 | else if (pte_none(ptent) || pte_file(ptent)) | 5651 | else if (pte_none(ptent) || pte_file(ptent)) |
5640 | page = mc_handle_file_pte(vma, addr, ptent, &ent); | 5652 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
5641 | 5653 | ||
5642 | if (!page && !ent.val) | 5654 | if (!page && !ent.val) |
5643 | return ret; | 5655 | return ret; |
5644 | if (page) { | 5656 | if (page) { |
5645 | pc = lookup_page_cgroup(page); | 5657 | pc = lookup_page_cgroup(page); |
5646 | /* | 5658 | /* |
5647 | * Do only loose check w/o serialization. | 5659 | * Do only loose check w/o serialization. |
5648 | * mem_cgroup_move_account() checks the pc is valid or | 5660 | * mem_cgroup_move_account() checks the pc is valid or |
5649 | * not under LRU exclusion. | 5661 | * not under LRU exclusion. |
5650 | */ | 5662 | */ |
5651 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 5663 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { |
5652 | ret = MC_TARGET_PAGE; | 5664 | ret = MC_TARGET_PAGE; |
5653 | if (target) | 5665 | if (target) |
5654 | target->page = page; | 5666 | target->page = page; |
5655 | } | 5667 | } |
5656 | if (!ret || !target) | 5668 | if (!ret || !target) |
5657 | put_page(page); | 5669 | put_page(page); |
5658 | } | 5670 | } |
5659 | /* There is a swap entry and a page doesn't exist or isn't charged */ | 5671 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
5660 | if (ent.val && !ret && | 5672 | if (ent.val && !ret && |
5661 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { | 5673 | mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { |
5662 | ret = MC_TARGET_SWAP; | 5674 | ret = MC_TARGET_SWAP; |
5663 | if (target) | 5675 | if (target) |
5664 | target->ent = ent; | 5676 | target->ent = ent; |
5665 | } | 5677 | } |
5666 | return ret; | 5678 | return ret; |
5667 | } | 5679 | } |
5668 | 5680 | ||
5669 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 5681 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
5670 | /* | 5682 | /* |
5671 | * We don't consider swapping or file mapped pages because THP does not | 5683 | * We don't consider swapping or file mapped pages because THP does not |
5672 | * support them for now. | 5684 | * support them for now. |
5673 | * Caller should make sure that pmd_trans_huge(pmd) is true. | 5685 | * Caller should make sure that pmd_trans_huge(pmd) is true. |
5674 | */ | 5686 | */ |
5675 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | 5687 | static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, |
5676 | unsigned long addr, pmd_t pmd, union mc_target *target) | 5688 | unsigned long addr, pmd_t pmd, union mc_target *target) |
5677 | { | 5689 | { |
5678 | struct page *page = NULL; | 5690 | struct page *page = NULL; |
5679 | struct page_cgroup *pc; | 5691 | struct page_cgroup *pc; |
5680 | enum mc_target_type ret = MC_TARGET_NONE; | 5692 | enum mc_target_type ret = MC_TARGET_NONE; |
5681 | 5693 | ||
5682 | page = pmd_page(pmd); | 5694 | page = pmd_page(pmd); |
5683 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); | 5695 | VM_BUG_ON_PAGE(!page || !PageHead(page), page); |
5684 | if (!move_anon()) | 5696 | if (!move_anon()) |
5685 | return ret; | 5697 | return ret; |
5686 | pc = lookup_page_cgroup(page); | 5698 | pc = lookup_page_cgroup(page); |
5687 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | 5699 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { |
5688 | ret = MC_TARGET_PAGE; | 5700 | ret = MC_TARGET_PAGE; |
5689 | if (target) { | 5701 | if (target) { |
5690 | get_page(page); | 5702 | get_page(page); |
5691 | target->page = page; | 5703 | target->page = page; |
5692 | } | 5704 | } |
5693 | } | 5705 | } |
5694 | return ret; | 5706 | return ret; |
5695 | } | 5707 | } |
5696 | #else | 5708 | #else |
5697 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, | 5709 | static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, |
5698 | unsigned long addr, pmd_t pmd, union mc_target *target) | 5710 | unsigned long addr, pmd_t pmd, union mc_target *target) |
5699 | { | 5711 | { |
5700 | return MC_TARGET_NONE; | 5712 | return MC_TARGET_NONE; |
5701 | } | 5713 | } |
5702 | #endif | 5714 | #endif |
5703 | 5715 | ||
5704 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | 5716 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, |
5705 | unsigned long addr, unsigned long end, | 5717 | unsigned long addr, unsigned long end, |
5706 | struct mm_walk *walk) | 5718 | struct mm_walk *walk) |
5707 | { | 5719 | { |
5708 | struct vm_area_struct *vma = walk->private; | 5720 | struct vm_area_struct *vma = walk->private; |
5709 | pte_t *pte; | 5721 | pte_t *pte; |
5710 | spinlock_t *ptl; | 5722 | spinlock_t *ptl; |
5711 | 5723 | ||
5712 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 5724 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
5713 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | 5725 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) |
5714 | mc.precharge += HPAGE_PMD_NR; | 5726 | mc.precharge += HPAGE_PMD_NR; |
5715 | spin_unlock(ptl); | 5727 | spin_unlock(ptl); |
5716 | return 0; | 5728 | return 0; |
5717 | } | 5729 | } |
5718 | 5730 | ||
5719 | if (pmd_trans_unstable(pmd)) | 5731 | if (pmd_trans_unstable(pmd)) |
5720 | return 0; | 5732 | return 0; |
5721 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5733 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5722 | for (; addr != end; pte++, addr += PAGE_SIZE) | 5734 | for (; addr != end; pte++, addr += PAGE_SIZE) |
5723 | if (get_mctgt_type(vma, addr, *pte, NULL)) | 5735 | if (get_mctgt_type(vma, addr, *pte, NULL)) |
5724 | mc.precharge++; /* increment precharge temporarily */ | 5736 | mc.precharge++; /* increment precharge temporarily */ |
5725 | pte_unmap_unlock(pte - 1, ptl); | 5737 | pte_unmap_unlock(pte - 1, ptl); |
5726 | cond_resched(); | 5738 | cond_resched(); |
5727 | 5739 | ||
5728 | return 0; | 5740 | return 0; |
5729 | } | 5741 | } |
5730 | 5742 | ||
5731 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | 5743 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) |
5732 | { | 5744 | { |
5733 | unsigned long precharge; | 5745 | unsigned long precharge; |
5734 | struct vm_area_struct *vma; | 5746 | struct vm_area_struct *vma; |
5735 | 5747 | ||
5736 | down_read(&mm->mmap_sem); | 5748 | down_read(&mm->mmap_sem); |
5737 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 5749 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
5738 | struct mm_walk mem_cgroup_count_precharge_walk = { | 5750 | struct mm_walk mem_cgroup_count_precharge_walk = { |
5739 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | 5751 | .pmd_entry = mem_cgroup_count_precharge_pte_range, |
5740 | .mm = mm, | 5752 | .mm = mm, |
5741 | .private = vma, | 5753 | .private = vma, |
5742 | }; | 5754 | }; |
5743 | if (is_vm_hugetlb_page(vma)) | 5755 | if (is_vm_hugetlb_page(vma)) |
5744 | continue; | 5756 | continue; |
5745 | walk_page_range(vma->vm_start, vma->vm_end, | 5757 | walk_page_range(vma->vm_start, vma->vm_end, |
5746 | &mem_cgroup_count_precharge_walk); | 5758 | &mem_cgroup_count_precharge_walk); |
5747 | } | 5759 | } |
5748 | up_read(&mm->mmap_sem); | 5760 | up_read(&mm->mmap_sem); |
5749 | 5761 | ||
5750 | precharge = mc.precharge; | 5762 | precharge = mc.precharge; |
5751 | mc.precharge = 0; | 5763 | mc.precharge = 0; |
5752 | 5764 | ||
5753 | return precharge; | 5765 | return precharge; |
5754 | } | 5766 | } |
5755 | 5767 | ||
5756 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | 5768 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) |
5757 | { | 5769 | { |
5758 | unsigned long precharge = mem_cgroup_count_precharge(mm); | 5770 | unsigned long precharge = mem_cgroup_count_precharge(mm); |
5759 | 5771 | ||
5760 | VM_BUG_ON(mc.moving_task); | 5772 | VM_BUG_ON(mc.moving_task); |
5761 | mc.moving_task = current; | 5773 | mc.moving_task = current; |
5762 | return mem_cgroup_do_precharge(precharge); | 5774 | return mem_cgroup_do_precharge(precharge); |
5763 | } | 5775 | } |
5764 | 5776 | ||
5765 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ | 5777 | /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ |
5766 | static void __mem_cgroup_clear_mc(void) | 5778 | static void __mem_cgroup_clear_mc(void) |
5767 | { | 5779 | { |
5768 | struct mem_cgroup *from = mc.from; | 5780 | struct mem_cgroup *from = mc.from; |
5769 | struct mem_cgroup *to = mc.to; | 5781 | struct mem_cgroup *to = mc.to; |
5770 | int i; | ||
5771 | 5782 | ||
5772 | /* we must uncharge all the leftover precharges from mc.to */ | 5783 | /* we must uncharge all the leftover precharges from mc.to */ |
5773 | if (mc.precharge) { | 5784 | if (mc.precharge) { |
5774 | cancel_charge(mc.to, mc.precharge); | 5785 | cancel_charge(mc.to, mc.precharge); |
5775 | mc.precharge = 0; | 5786 | mc.precharge = 0; |
5776 | } | 5787 | } |
5777 | /* | 5788 | /* |
5778 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 5789 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
5779 | * we must uncharge here. | 5790 | * we must uncharge here. |
5780 | */ | 5791 | */ |
5781 | if (mc.moved_charge) { | 5792 | if (mc.moved_charge) { |
5782 | cancel_charge(mc.from, mc.moved_charge); | 5793 | cancel_charge(mc.from, mc.moved_charge); |
5783 | mc.moved_charge = 0; | 5794 | mc.moved_charge = 0; |
5784 | } | 5795 | } |
5785 | /* we must fixup refcnts and charges */ | 5796 | /* we must fixup refcnts and charges */ |
5786 | if (mc.moved_swap) { | 5797 | if (mc.moved_swap) { |
5787 | /* uncharge swap account from the old cgroup */ | 5798 | /* uncharge swap account from the old cgroup */ |
5788 | if (!mem_cgroup_is_root(mc.from)) | 5799 | if (!mem_cgroup_is_root(mc.from)) |
5789 | page_counter_uncharge(&mc.from->memsw, mc.moved_swap); | 5800 | page_counter_uncharge(&mc.from->memsw, mc.moved_swap); |
5790 | 5801 | ||
5791 | /* | 5802 | /* |
5792 | * we charged both to->memory and to->memsw, so we | 5803 | * we charged both to->memory and to->memsw, so we |
5793 | * should uncharge to->memory. | 5804 | * should uncharge to->memory. |
5794 | */ | 5805 | */ |
5795 | if (!mem_cgroup_is_root(mc.to)) | 5806 | if (!mem_cgroup_is_root(mc.to)) |
5796 | page_counter_uncharge(&mc.to->memory, mc.moved_swap); | 5807 | page_counter_uncharge(&mc.to->memory, mc.moved_swap); |
5797 | 5808 | ||
5798 | for (i = 0; i < mc.moved_swap; i++) | 5809 | css_put_many(&mc.from->css, mc.moved_swap); |
5799 | css_put(&mc.from->css); | ||
5800 | 5810 | ||
5801 | /* we've already done css_get(mc.to) */ | 5811 | /* we've already done css_get(mc.to) */ |
5802 | mc.moved_swap = 0; | 5812 | mc.moved_swap = 0; |
5803 | } | 5813 | } |
5804 | memcg_oom_recover(from); | 5814 | memcg_oom_recover(from); |
5805 | memcg_oom_recover(to); | 5815 | memcg_oom_recover(to); |
5806 | wake_up_all(&mc.waitq); | 5816 | wake_up_all(&mc.waitq); |
5807 | } | 5817 | } |
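/*
 * Illustrative sketch, not part of the commit text: the hunk above replaces a
 * loop of single css_put() calls with one batched css_put_many().  Both drop
 * the same number of references; the batched call simply adjusts the percpu
 * reference count once.  The helper name is hypothetical.
 */
static void example_drop_swap_refs(struct mem_cgroup *from, unsigned int nr)
{
	/* Equivalent to calling css_put(&from->css) nr times. */
	css_put_many(&from->css, nr);
}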
5808 | 5818 | ||
5809 | static void mem_cgroup_clear_mc(void) | 5819 | static void mem_cgroup_clear_mc(void) |
5810 | { | 5820 | { |
5811 | struct mem_cgroup *from = mc.from; | 5821 | struct mem_cgroup *from = mc.from; |
5812 | 5822 | ||
5813 | /* | 5823 | /* |
5814 | * we must clear moving_task before waking up waiters at the end of | 5824 | * we must clear moving_task before waking up waiters at the end of |
5815 | * task migration. | 5825 | * task migration. |
5816 | */ | 5826 | */ |
5817 | mc.moving_task = NULL; | 5827 | mc.moving_task = NULL; |
5818 | __mem_cgroup_clear_mc(); | 5828 | __mem_cgroup_clear_mc(); |
5819 | spin_lock(&mc.lock); | 5829 | spin_lock(&mc.lock); |
5820 | mc.from = NULL; | 5830 | mc.from = NULL; |
5821 | mc.to = NULL; | 5831 | mc.to = NULL; |
5822 | spin_unlock(&mc.lock); | 5832 | spin_unlock(&mc.lock); |
5823 | mem_cgroup_end_move(from); | 5833 | mem_cgroup_end_move(from); |
5824 | } | 5834 | } |
5825 | 5835 | ||
5826 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | 5836 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, |
5827 | struct cgroup_taskset *tset) | 5837 | struct cgroup_taskset *tset) |
5828 | { | 5838 | { |
5829 | struct task_struct *p = cgroup_taskset_first(tset); | 5839 | struct task_struct *p = cgroup_taskset_first(tset); |
5830 | int ret = 0; | 5840 | int ret = 0; |
5831 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); | 5841 | struct mem_cgroup *memcg = mem_cgroup_from_css(css); |
5832 | unsigned long move_charge_at_immigrate; | 5842 | unsigned long move_charge_at_immigrate; |
5833 | 5843 | ||
5834 | /* | 5844 | /* |
5835 | * We are now committed to this value whatever it is. Changes in this | 5845 | * We are now committed to this value whatever it is. Changes in this |
5836 | * tunable will only affect upcoming migrations, not the current one. | 5846 | * tunable will only affect upcoming migrations, not the current one. |
5837 | * So we need to save it, and keep it going. | 5847 | * So we need to save it, and keep it going. |
5838 | */ | 5848 | */ |
5839 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; | 5849 | move_charge_at_immigrate = memcg->move_charge_at_immigrate; |
5840 | if (move_charge_at_immigrate) { | 5850 | if (move_charge_at_immigrate) { |
5841 | struct mm_struct *mm; | 5851 | struct mm_struct *mm; |
5842 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5852 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
5843 | 5853 | ||
5844 | VM_BUG_ON(from == memcg); | 5854 | VM_BUG_ON(from == memcg); |
5845 | 5855 | ||
5846 | mm = get_task_mm(p); | 5856 | mm = get_task_mm(p); |
5847 | if (!mm) | 5857 | if (!mm) |
5848 | return 0; | 5858 | return 0; |
5849 | /* We move charges only when we move an owner of the mm */ | 5859 | /* We move charges only when we move an owner of the mm */ |
5850 | if (mm->owner == p) { | 5860 | if (mm->owner == p) { |
5851 | VM_BUG_ON(mc.from); | 5861 | VM_BUG_ON(mc.from); |
5852 | VM_BUG_ON(mc.to); | 5862 | VM_BUG_ON(mc.to); |
5853 | VM_BUG_ON(mc.precharge); | 5863 | VM_BUG_ON(mc.precharge); |
5854 | VM_BUG_ON(mc.moved_charge); | 5864 | VM_BUG_ON(mc.moved_charge); |
5855 | VM_BUG_ON(mc.moved_swap); | 5865 | VM_BUG_ON(mc.moved_swap); |
5856 | mem_cgroup_start_move(from); | 5866 | mem_cgroup_start_move(from); |
5857 | spin_lock(&mc.lock); | 5867 | spin_lock(&mc.lock); |
5858 | mc.from = from; | 5868 | mc.from = from; |
5859 | mc.to = memcg; | 5869 | mc.to = memcg; |
5860 | mc.immigrate_flags = move_charge_at_immigrate; | 5870 | mc.immigrate_flags = move_charge_at_immigrate; |
5861 | spin_unlock(&mc.lock); | 5871 | spin_unlock(&mc.lock); |
5862 | /* We set mc.moving_task later */ | 5872 | /* We set mc.moving_task later */ |
5863 | 5873 | ||
5864 | ret = mem_cgroup_precharge_mc(mm); | 5874 | ret = mem_cgroup_precharge_mc(mm); |
5865 | if (ret) | 5875 | if (ret) |
5866 | mem_cgroup_clear_mc(); | 5876 | mem_cgroup_clear_mc(); |
5867 | } | 5877 | } |
5868 | mmput(mm); | 5878 | mmput(mm); |
5869 | } | 5879 | } |
5870 | return ret; | 5880 | return ret; |
5871 | } | 5881 | } |
5872 | 5882 | ||
5873 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, | 5883 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, |
5874 | struct cgroup_taskset *tset) | 5884 | struct cgroup_taskset *tset) |
5875 | { | 5885 | { |
5876 | mem_cgroup_clear_mc(); | 5886 | mem_cgroup_clear_mc(); |
5877 | } | 5887 | } |
5878 | 5888 | ||
5879 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | 5889 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, |
5880 | unsigned long addr, unsigned long end, | 5890 | unsigned long addr, unsigned long end, |
5881 | struct mm_walk *walk) | 5891 | struct mm_walk *walk) |
5882 | { | 5892 | { |
5883 | int ret = 0; | 5893 | int ret = 0; |
5884 | struct vm_area_struct *vma = walk->private; | 5894 | struct vm_area_struct *vma = walk->private; |
5885 | pte_t *pte; | 5895 | pte_t *pte; |
5886 | spinlock_t *ptl; | 5896 | spinlock_t *ptl; |
5887 | enum mc_target_type target_type; | 5897 | enum mc_target_type target_type; |
5888 | union mc_target target; | 5898 | union mc_target target; |
5889 | struct page *page; | 5899 | struct page *page; |
5890 | struct page_cgroup *pc; | 5900 | struct page_cgroup *pc; |
5891 | 5901 | ||
5892 | /* | 5902 | /* |
5893 | * We don't take compound_lock() here but no race with splitting thp | 5903 | * We don't take compound_lock() here but no race with splitting thp |
5894 | * happens because: | 5904 | * happens because: |
5895 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not | 5905 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not |
5896 | * under splitting, which means there's no concurrent thp split, | 5906 | * under splitting, which means there's no concurrent thp split, |
5897 | * - if another thread runs into split_huge_page() just after we | 5907 | * - if another thread runs into split_huge_page() just after we |
5898 | * entered this if-block, the thread must wait for page table lock | 5908 | * entered this if-block, the thread must wait for page table lock |
5899 | * to be unlocked in __split_huge_page_splitting(), where the main | 5909 | * to be unlocked in __split_huge_page_splitting(), where the main |
5900 | * part of thp split is not executed yet. | 5910 | * part of thp split is not executed yet. |
5901 | */ | 5911 | */ |
5902 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 5912 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
5903 | if (mc.precharge < HPAGE_PMD_NR) { | 5913 | if (mc.precharge < HPAGE_PMD_NR) { |
5904 | spin_unlock(ptl); | 5914 | spin_unlock(ptl); |
5905 | return 0; | 5915 | return 0; |
5906 | } | 5916 | } |
5907 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); | 5917 | target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); |
5908 | if (target_type == MC_TARGET_PAGE) { | 5918 | if (target_type == MC_TARGET_PAGE) { |
5909 | page = target.page; | 5919 | page = target.page; |
5910 | if (!isolate_lru_page(page)) { | 5920 | if (!isolate_lru_page(page)) { |
5911 | pc = lookup_page_cgroup(page); | 5921 | pc = lookup_page_cgroup(page); |
5912 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 5922 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, |
5913 | pc, mc.from, mc.to)) { | 5923 | pc, mc.from, mc.to)) { |
5914 | mc.precharge -= HPAGE_PMD_NR; | 5924 | mc.precharge -= HPAGE_PMD_NR; |
5915 | mc.moved_charge += HPAGE_PMD_NR; | 5925 | mc.moved_charge += HPAGE_PMD_NR; |
5916 | } | 5926 | } |
5917 | putback_lru_page(page); | 5927 | putback_lru_page(page); |
5918 | } | 5928 | } |
5919 | put_page(page); | 5929 | put_page(page); |
5920 | } | 5930 | } |
5921 | spin_unlock(ptl); | 5931 | spin_unlock(ptl); |
5922 | return 0; | 5932 | return 0; |
5923 | } | 5933 | } |
5924 | 5934 | ||
5925 | if (pmd_trans_unstable(pmd)) | 5935 | if (pmd_trans_unstable(pmd)) |
5926 | return 0; | 5936 | return 0; |
5927 | retry: | 5937 | retry: |
5928 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 5938 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
5929 | for (; addr != end; addr += PAGE_SIZE) { | 5939 | for (; addr != end; addr += PAGE_SIZE) { |
5930 | pte_t ptent = *(pte++); | 5940 | pte_t ptent = *(pte++); |
5931 | swp_entry_t ent; | 5941 | swp_entry_t ent; |
5932 | 5942 | ||
5933 | if (!mc.precharge) | 5943 | if (!mc.precharge) |
5934 | break; | 5944 | break; |
5935 | 5945 | ||
5936 | switch (get_mctgt_type(vma, addr, ptent, &target)) { | 5946 | switch (get_mctgt_type(vma, addr, ptent, &target)) { |
5937 | case MC_TARGET_PAGE: | 5947 | case MC_TARGET_PAGE: |
5938 | page = target.page; | 5948 | page = target.page; |
5939 | if (isolate_lru_page(page)) | 5949 | if (isolate_lru_page(page)) |
5940 | goto put; | 5950 | goto put; |
5941 | pc = lookup_page_cgroup(page); | 5951 | pc = lookup_page_cgroup(page); |
5942 | if (!mem_cgroup_move_account(page, 1, pc, | 5952 | if (!mem_cgroup_move_account(page, 1, pc, |
5943 | mc.from, mc.to)) { | 5953 | mc.from, mc.to)) { |
5944 | mc.precharge--; | 5954 | mc.precharge--; |
5945 | /* we uncharge from mc.from later. */ | 5955 | /* we uncharge from mc.from later. */ |
5946 | mc.moved_charge++; | 5956 | mc.moved_charge++; |
5947 | } | 5957 | } |
5948 | putback_lru_page(page); | 5958 | putback_lru_page(page); |
5949 | put: /* get_mctgt_type() gets the page */ | 5959 | put: /* get_mctgt_type() gets the page */ |
5950 | put_page(page); | 5960 | put_page(page); |
5951 | break; | 5961 | break; |
5952 | case MC_TARGET_SWAP: | 5962 | case MC_TARGET_SWAP: |
5953 | ent = target.ent; | 5963 | ent = target.ent; |
5954 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { | 5964 | if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { |
5955 | mc.precharge--; | 5965 | mc.precharge--; |
5956 | /* we fixup refcnts and charges later. */ | 5966 | /* we fixup refcnts and charges later. */ |
5957 | mc.moved_swap++; | 5967 | mc.moved_swap++; |
5958 | } | 5968 | } |
5959 | break; | 5969 | break; |
5960 | default: | 5970 | default: |
5961 | break; | 5971 | break; |
5962 | } | 5972 | } |
5963 | } | 5973 | } |
5964 | pte_unmap_unlock(pte - 1, ptl); | 5974 | pte_unmap_unlock(pte - 1, ptl); |
5965 | cond_resched(); | 5975 | cond_resched(); |
5966 | 5976 | ||
5967 | if (addr != end) { | 5977 | if (addr != end) { |
5968 | /* | 5978 | /* |
5969 | * We have consumed all precharges we got in can_attach(). | 5979 | * We have consumed all precharges we got in can_attach(). |
5970 | * We try charge one by one, but don't do any additional | 5980 | * We try charge one by one, but don't do any additional |
5971 | * charges to mc.to if we have failed in charge once in attach() | 5981 | * charges to mc.to if we have failed in charge once in attach() |
5972 | * phase. | 5982 | * phase. |
5973 | */ | 5983 | */ |
5974 | ret = mem_cgroup_do_precharge(1); | 5984 | ret = mem_cgroup_do_precharge(1); |
5975 | if (!ret) | 5985 | if (!ret) |
5976 | goto retry; | 5986 | goto retry; |
5977 | } | 5987 | } |
5978 | 5988 | ||
5979 | return ret; | 5989 | return ret; |
5980 | } | 5990 | } |
5981 | 5991 | ||
5982 | static void mem_cgroup_move_charge(struct mm_struct *mm) | 5992 | static void mem_cgroup_move_charge(struct mm_struct *mm) |
5983 | { | 5993 | { |
5984 | struct vm_area_struct *vma; | 5994 | struct vm_area_struct *vma; |
5985 | 5995 | ||
5986 | lru_add_drain_all(); | 5996 | lru_add_drain_all(); |
5987 | retry: | 5997 | retry: |
5988 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { | 5998 | if (unlikely(!down_read_trylock(&mm->mmap_sem))) { |
5989 | /* | 5999 | /* |
5990 | * Someone who is holding the mmap_sem might be waiting in | 6000 | * Someone who is holding the mmap_sem might be waiting in |
5991 | * waitq. So we cancel all extra charges, wake up all waiters, | 6001 | * waitq. So we cancel all extra charges, wake up all waiters, |
5992 | * and retry. Because we cancel precharges, we might not be able | 6002 | * and retry. Because we cancel precharges, we might not be able |
5993 | * to move enough charges, but moving charge is a best-effort | 6003 | * to move enough charges, but moving charge is a best-effort |
5994 | * feature anyway, so it wouldn't be a big problem. | 6004 | * feature anyway, so it wouldn't be a big problem. |
5995 | */ | 6005 | */ |
5996 | __mem_cgroup_clear_mc(); | 6006 | __mem_cgroup_clear_mc(); |
5997 | cond_resched(); | 6007 | cond_resched(); |
5998 | goto retry; | 6008 | goto retry; |
5999 | } | 6009 | } |
6000 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 6010 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
6001 | int ret; | 6011 | int ret; |
6002 | struct mm_walk mem_cgroup_move_charge_walk = { | 6012 | struct mm_walk mem_cgroup_move_charge_walk = { |
6003 | .pmd_entry = mem_cgroup_move_charge_pte_range, | 6013 | .pmd_entry = mem_cgroup_move_charge_pte_range, |
6004 | .mm = mm, | 6014 | .mm = mm, |
6005 | .private = vma, | 6015 | .private = vma, |
6006 | }; | 6016 | }; |
6007 | if (is_vm_hugetlb_page(vma)) | 6017 | if (is_vm_hugetlb_page(vma)) |
6008 | continue; | 6018 | continue; |
6009 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 6019 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
6010 | &mem_cgroup_move_charge_walk); | 6020 | &mem_cgroup_move_charge_walk); |
6011 | if (ret) | 6021 | if (ret) |
6012 | /* | 6022 | /* |
6013 | * means we have consumed all precharges and failed in | 6023 | * means we have consumed all precharges and failed in |
6014 | * doing additional charge. Just abandon here. | 6024 | * doing additional charge. Just abandon here. |
6015 | */ | 6025 | */ |
6016 | break; | 6026 | break; |
6017 | } | 6027 | } |
6018 | up_read(&mm->mmap_sem); | 6028 | up_read(&mm->mmap_sem); |
6019 | } | 6029 | } |
6020 | 6030 | ||
6021 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, | 6031 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, |
6022 | struct cgroup_taskset *tset) | 6032 | struct cgroup_taskset *tset) |
6023 | { | 6033 | { |
6024 | struct task_struct *p = cgroup_taskset_first(tset); | 6034 | struct task_struct *p = cgroup_taskset_first(tset); |
6025 | struct mm_struct *mm = get_task_mm(p); | 6035 | struct mm_struct *mm = get_task_mm(p); |
6026 | 6036 | ||
6027 | if (mm) { | 6037 | if (mm) { |
6028 | if (mc.to) | 6038 | if (mc.to) |
6029 | mem_cgroup_move_charge(mm); | 6039 | mem_cgroup_move_charge(mm); |
6030 | mmput(mm); | 6040 | mmput(mm); |
6031 | } | 6041 | } |
6032 | if (mc.to) | 6042 | if (mc.to) |
6033 | mem_cgroup_clear_mc(); | 6043 | mem_cgroup_clear_mc(); |
6034 | } | 6044 | } |
6035 | #else /* !CONFIG_MMU */ | 6045 | #else /* !CONFIG_MMU */ |
6036 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | 6046 | static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, |
6037 | struct cgroup_taskset *tset) | 6047 | struct cgroup_taskset *tset) |
6038 | { | 6048 | { |
6039 | return 0; | 6049 | return 0; |
6040 | } | 6050 | } |
6041 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, | 6051 | static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, |
6042 | struct cgroup_taskset *tset) | 6052 | struct cgroup_taskset *tset) |
6043 | { | 6053 | { |
6044 | } | 6054 | } |
6045 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, | 6055 | static void mem_cgroup_move_task(struct cgroup_subsys_state *css, |
6046 | struct cgroup_taskset *tset) | 6056 | struct cgroup_taskset *tset) |
6047 | { | 6057 | { |
6048 | } | 6058 | } |
6049 | #endif | 6059 | #endif |
6050 | 6060 | ||
6051 | /* | 6061 | /* |
6052 | * Cgroup retains root cgroups across [un]mount cycles making it necessary | 6062 | * Cgroup retains root cgroups across [un]mount cycles making it necessary |
6053 | * to verify whether we're attached to the default hierarchy on each mount | 6063 | * to verify whether we're attached to the default hierarchy on each mount |
6054 | * attempt. | 6064 | * attempt. |
6055 | */ | 6065 | */ |
6056 | static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | 6066 | static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) |
6057 | { | 6067 | { |
6058 | /* | 6068 | /* |
6059 | * use_hierarchy is forced on the default hierarchy. cgroup core | 6069 | * use_hierarchy is forced on the default hierarchy. cgroup core |
6060 | * guarantees that @root doesn't have any children, so turning it | 6070 | * guarantees that @root doesn't have any children, so turning it |
6061 | * on for the root memcg is enough. | 6071 | * on for the root memcg is enough. |
6062 | */ | 6072 | */ |
6063 | if (cgroup_on_dfl(root_css->cgroup)) | 6073 | if (cgroup_on_dfl(root_css->cgroup)) |
6064 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 6074 | mem_cgroup_from_css(root_css)->use_hierarchy = true; |
6065 | } | 6075 | } |
6066 | 6076 | ||
6067 | struct cgroup_subsys memory_cgrp_subsys = { | 6077 | struct cgroup_subsys memory_cgrp_subsys = { |
6068 | .css_alloc = mem_cgroup_css_alloc, | 6078 | .css_alloc = mem_cgroup_css_alloc, |
6069 | .css_online = mem_cgroup_css_online, | 6079 | .css_online = mem_cgroup_css_online, |
6070 | .css_offline = mem_cgroup_css_offline, | 6080 | .css_offline = mem_cgroup_css_offline, |
6071 | .css_free = mem_cgroup_css_free, | 6081 | .css_free = mem_cgroup_css_free, |
6072 | .css_reset = mem_cgroup_css_reset, | 6082 | .css_reset = mem_cgroup_css_reset, |
6073 | .can_attach = mem_cgroup_can_attach, | 6083 | .can_attach = mem_cgroup_can_attach, |
6074 | .cancel_attach = mem_cgroup_cancel_attach, | 6084 | .cancel_attach = mem_cgroup_cancel_attach, |
6075 | .attach = mem_cgroup_move_task, | 6085 | .attach = mem_cgroup_move_task, |
6076 | .bind = mem_cgroup_bind, | 6086 | .bind = mem_cgroup_bind, |
6077 | .legacy_cftypes = mem_cgroup_files, | 6087 | .legacy_cftypes = mem_cgroup_files, |
6078 | .early_init = 0, | 6088 | .early_init = 0, |
6079 | }; | 6089 | }; |
6080 | 6090 | ||
6081 | #ifdef CONFIG_MEMCG_SWAP | 6091 | #ifdef CONFIG_MEMCG_SWAP |
6082 | static int __init enable_swap_account(char *s) | 6092 | static int __init enable_swap_account(char *s) |
6083 | { | 6093 | { |
6084 | if (!strcmp(s, "1")) | 6094 | if (!strcmp(s, "1")) |
6085 | really_do_swap_account = 1; | 6095 | really_do_swap_account = 1; |
6086 | else if (!strcmp(s, "0")) | 6096 | else if (!strcmp(s, "0")) |
6087 | really_do_swap_account = 0; | 6097 | really_do_swap_account = 0; |
6088 | return 1; | 6098 | return 1; |
6089 | } | 6099 | } |
6090 | __setup("swapaccount=", enable_swap_account); | 6100 | __setup("swapaccount=", enable_swap_account); |
6091 | 6101 | ||
6092 | static void __init memsw_file_init(void) | 6102 | static void __init memsw_file_init(void) |
6093 | { | 6103 | { |
6094 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, | 6104 | WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, |
6095 | memsw_cgroup_files)); | 6105 | memsw_cgroup_files)); |
6096 | } | 6106 | } |
6097 | 6107 | ||
6098 | static void __init enable_swap_cgroup(void) | 6108 | static void __init enable_swap_cgroup(void) |
6099 | { | 6109 | { |
6100 | if (!mem_cgroup_disabled() && really_do_swap_account) { | 6110 | if (!mem_cgroup_disabled() && really_do_swap_account) { |
6101 | do_swap_account = 1; | 6111 | do_swap_account = 1; |
6102 | memsw_file_init(); | 6112 | memsw_file_init(); |
6103 | } | 6113 | } |
6104 | } | 6114 | } |
6105 | 6115 | ||
6106 | #else | 6116 | #else |
6107 | static void __init enable_swap_cgroup(void) | 6117 | static void __init enable_swap_cgroup(void) |
6108 | { | 6118 | { |
6109 | } | 6119 | } |
6110 | #endif | 6120 | #endif |
6111 | 6121 | ||
6112 | #ifdef CONFIG_MEMCG_SWAP | 6122 | #ifdef CONFIG_MEMCG_SWAP |
6113 | /** | 6123 | /** |
6114 | * mem_cgroup_swapout - transfer a memsw charge to swap | 6124 | * mem_cgroup_swapout - transfer a memsw charge to swap |
6115 | * @page: page whose memsw charge to transfer | 6125 | * @page: page whose memsw charge to transfer |
6116 | * @entry: swap entry to move the charge to | 6126 | * @entry: swap entry to move the charge to |
6117 | * | 6127 | * |
6118 | * Transfer the memsw charge of @page to @entry. | 6128 | * Transfer the memsw charge of @page to @entry. |
6119 | */ | 6129 | */ |
6120 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | 6130 | void mem_cgroup_swapout(struct page *page, swp_entry_t entry) |
6121 | { | 6131 | { |
6122 | struct page_cgroup *pc; | 6132 | struct page_cgroup *pc; |
6123 | unsigned short oldid; | 6133 | unsigned short oldid; |
6124 | 6134 | ||
6125 | VM_BUG_ON_PAGE(PageLRU(page), page); | 6135 | VM_BUG_ON_PAGE(PageLRU(page), page); |
6126 | VM_BUG_ON_PAGE(page_count(page), page); | 6136 | VM_BUG_ON_PAGE(page_count(page), page); |
6127 | 6137 | ||
6128 | if (!do_swap_account) | 6138 | if (!do_swap_account) |
6129 | return; | 6139 | return; |
6130 | 6140 | ||
6131 | pc = lookup_page_cgroup(page); | 6141 | pc = lookup_page_cgroup(page); |
6132 | 6142 | ||
6133 | /* Readahead page, never charged */ | 6143 | /* Readahead page, never charged */ |
6134 | if (!PageCgroupUsed(pc)) | 6144 | if (!PageCgroupUsed(pc)) |
6135 | return; | 6145 | return; |
6136 | 6146 | ||
6137 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); | 6147 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); |
6138 | 6148 | ||
6139 | oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); | 6149 | oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); |
6140 | VM_BUG_ON_PAGE(oldid, page); | 6150 | VM_BUG_ON_PAGE(oldid, page); |
6141 | 6151 | ||
6142 | pc->flags &= ~PCG_MEMSW; | 6152 | pc->flags &= ~PCG_MEMSW; |
6143 | css_get(&pc->mem_cgroup->css); | 6153 | css_get(&pc->mem_cgroup->css); |
6144 | mem_cgroup_swap_statistics(pc->mem_cgroup, true); | 6154 | mem_cgroup_swap_statistics(pc->mem_cgroup, true); |
6145 | } | 6155 | } |
6146 | 6156 | ||
6147 | /** | 6157 | /** |
6148 | * mem_cgroup_uncharge_swap - uncharge a swap entry | 6158 | * mem_cgroup_uncharge_swap - uncharge a swap entry |
6149 | * @entry: swap entry to uncharge | 6159 | * @entry: swap entry to uncharge |
6150 | * | 6160 | * |
6151 | * Drop the memsw charge associated with @entry. | 6161 | * Drop the memsw charge associated with @entry. |
6152 | */ | 6162 | */ |
6153 | void mem_cgroup_uncharge_swap(swp_entry_t entry) | 6163 | void mem_cgroup_uncharge_swap(swp_entry_t entry) |
6154 | { | 6164 | { |
6155 | struct mem_cgroup *memcg; | 6165 | struct mem_cgroup *memcg; |
6156 | unsigned short id; | 6166 | unsigned short id; |
6157 | 6167 | ||
6158 | if (!do_swap_account) | 6168 | if (!do_swap_account) |
6159 | return; | 6169 | return; |
6160 | 6170 | ||
6161 | id = swap_cgroup_record(entry, 0); | 6171 | id = swap_cgroup_record(entry, 0); |
6162 | rcu_read_lock(); | 6172 | rcu_read_lock(); |
6163 | memcg = mem_cgroup_lookup(id); | 6173 | memcg = mem_cgroup_lookup(id); |
6164 | if (memcg) { | 6174 | if (memcg) { |
6165 | if (!mem_cgroup_is_root(memcg)) | 6175 | if (!mem_cgroup_is_root(memcg)) |
6166 | page_counter_uncharge(&memcg->memsw, 1); | 6176 | page_counter_uncharge(&memcg->memsw, 1); |
6167 | mem_cgroup_swap_statistics(memcg, false); | 6177 | mem_cgroup_swap_statistics(memcg, false); |
6168 | css_put(&memcg->css); | 6178 | css_put(&memcg->css); |
6169 | } | 6179 | } |
6170 | rcu_read_unlock(); | 6180 | rcu_read_unlock(); |
6171 | } | 6181 | } |
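/*
 * Illustrative sketch, not part of the commit: with an explicit css reference
 * behind every charge, mem_cgroup_swapout() pins the memcg for the swap
 * record and mem_cgroup_uncharge_swap() above drops that pin when the entry
 * dies.  Hypothetical helpers showing the two halves of the handoff:
 */
static void example_record_swap_ref(struct mem_cgroup *memcg, swp_entry_t entry)
{
	swap_cgroup_record(entry, mem_cgroup_id(memcg));
	css_get(&memcg->css);		/* reference now owned by the swap record */
}

static void example_release_swap_ref(struct mem_cgroup *memcg, swp_entry_t entry)
{
	swap_cgroup_record(entry, 0);	/* clear the record */
	css_put(&memcg->css);		/* and drop its reference */
}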
6172 | #endif | 6182 | #endif |
6173 | 6183 | ||
6174 | /** | 6184 | /** |
6175 | * mem_cgroup_try_charge - try charging a page | 6185 | * mem_cgroup_try_charge - try charging a page |
6176 | * @page: page to charge | 6186 | * @page: page to charge |
6177 | * @mm: mm context of the victim | 6187 | * @mm: mm context of the victim |
6178 | * @gfp_mask: reclaim mode | 6188 | * @gfp_mask: reclaim mode |
6179 | * @memcgp: charged memcg return | 6189 | * @memcgp: charged memcg return |
6180 | * | 6190 | * |
6181 | * Try to charge @page to the memcg that @mm belongs to, reclaiming | 6191 | * Try to charge @page to the memcg that @mm belongs to, reclaiming |
6182 | * pages according to @gfp_mask if necessary. | 6192 | * pages according to @gfp_mask if necessary. |
6183 | * | 6193 | * |
6184 | * Returns 0 on success, with *@memcgp pointing to the charged memcg. | 6194 | * Returns 0 on success, with *@memcgp pointing to the charged memcg. |
6185 | * Otherwise, an error code is returned. | 6195 | * Otherwise, an error code is returned. |
6186 | * | 6196 | * |
6187 | * After page->mapping has been set up, the caller must finalize the | 6197 | * After page->mapping has been set up, the caller must finalize the |
6188 | * charge with mem_cgroup_commit_charge(). Or abort the transaction | 6198 | * charge with mem_cgroup_commit_charge(). Or abort the transaction |
6189 | * with mem_cgroup_cancel_charge() in case page instantiation fails. | 6199 | * with mem_cgroup_cancel_charge() in case page instantiation fails. |
6190 | */ | 6200 | */ |
6191 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 6201 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
6192 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | 6202 | gfp_t gfp_mask, struct mem_cgroup **memcgp) |
6193 | { | 6203 | { |
6194 | struct mem_cgroup *memcg = NULL; | 6204 | struct mem_cgroup *memcg = NULL; |
6195 | unsigned int nr_pages = 1; | 6205 | unsigned int nr_pages = 1; |
6196 | int ret = 0; | 6206 | int ret = 0; |
6197 | 6207 | ||
6198 | if (mem_cgroup_disabled()) | 6208 | if (mem_cgroup_disabled()) |
6199 | goto out; | 6209 | goto out; |
6200 | 6210 | ||
6201 | if (PageSwapCache(page)) { | 6211 | if (PageSwapCache(page)) { |
6202 | struct page_cgroup *pc = lookup_page_cgroup(page); | 6212 | struct page_cgroup *pc = lookup_page_cgroup(page); |
6203 | /* | 6213 | /* |
6204 | * Every swap fault against a single page tries to charge the | 6214 | * Every swap fault against a single page tries to charge the |
6205 | * page, bail as early as possible. shmem_unuse() encounters | 6215 | * page, bail as early as possible. shmem_unuse() encounters |
6206 | * already charged pages, too. The USED bit is protected by | 6216 | * already charged pages, too. The USED bit is protected by |
6207 | * the page lock, which serializes swap cache removal, which | 6217 | * the page lock, which serializes swap cache removal, which |
6208 | * in turn serializes uncharging. | 6218 | * in turn serializes uncharging. |
6209 | */ | 6219 | */ |
6210 | if (PageCgroupUsed(pc)) | 6220 | if (PageCgroupUsed(pc)) |
6211 | goto out; | 6221 | goto out; |
6212 | } | 6222 | } |
6213 | 6223 | ||
6214 | if (PageTransHuge(page)) { | 6224 | if (PageTransHuge(page)) { |
6215 | nr_pages <<= compound_order(page); | 6225 | nr_pages <<= compound_order(page); |
6216 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 6226 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
6217 | } | 6227 | } |
6218 | 6228 | ||
6219 | if (do_swap_account && PageSwapCache(page)) | 6229 | if (do_swap_account && PageSwapCache(page)) |
6220 | memcg = try_get_mem_cgroup_from_page(page); | 6230 | memcg = try_get_mem_cgroup_from_page(page); |
6221 | if (!memcg) | 6231 | if (!memcg) |
6222 | memcg = get_mem_cgroup_from_mm(mm); | 6232 | memcg = get_mem_cgroup_from_mm(mm); |
6223 | 6233 | ||
6224 | ret = try_charge(memcg, gfp_mask, nr_pages); | 6234 | ret = try_charge(memcg, gfp_mask, nr_pages); |
6225 | 6235 | ||
6226 | css_put(&memcg->css); | 6236 | css_put(&memcg->css); |
6227 | 6237 | ||
6228 | if (ret == -EINTR) { | 6238 | if (ret == -EINTR) { |
6229 | memcg = root_mem_cgroup; | 6239 | memcg = root_mem_cgroup; |
6230 | ret = 0; | 6240 | ret = 0; |
6231 | } | 6241 | } |
6232 | out: | 6242 | out: |
6233 | *memcgp = memcg; | 6243 | *memcgp = memcg; |
6234 | return ret; | 6244 | return ret; |
6235 | } | 6245 | } |
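The protocol described in the comment above is easiest to see from the caller's side. Below is a hedged sketch of a hypothetical instantiation path, not code from this commit: instantiate_page() is a made-up placeholder for whatever sets up page->mapping, and the lock that the commit step requires (page table lock or page lock) is assumed to be taken inside it.

/*
 * Hypothetical caller of the try/commit/cancel charge protocol.
 * Error handling beyond the charge itself is elided.
 */
static int charge_and_instantiate(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *memcg;
	int ret;

	ret = mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg);
	if (ret)
		return ret;			/* nothing charged, nothing to undo */

	if (instantiate_page(page)) {		/* placeholder: sets up page->mapping */
		mem_cgroup_cancel_charge(page, memcg);
		return -ENOMEM;
	}

	/* page->mapping is set and the page is not yet on the LRU */
	mem_cgroup_commit_charge(page, memcg, false);
	return 0;
}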
6236 | 6246 | ||
6237 | /** | 6247 | /** |
6238 | * mem_cgroup_commit_charge - commit a page charge | 6248 | * mem_cgroup_commit_charge - commit a page charge |
6239 | * @page: page to charge | 6249 | * @page: page to charge |
6240 | * @memcg: memcg to charge the page to | 6250 | * @memcg: memcg to charge the page to |
6241 | * @lrucare: page might be on LRU already | 6251 | * @lrucare: page might be on LRU already |
6242 | * | 6252 | * |
6243 | * Finalize a charge transaction started by mem_cgroup_try_charge(), | 6253 | * Finalize a charge transaction started by mem_cgroup_try_charge(), |
6244 | * after page->mapping has been set up. This must happen atomically | 6254 | * after page->mapping has been set up. This must happen atomically |
6245 | * as part of the page instantiation, i.e. under the page table lock | 6255 | * as part of the page instantiation, i.e. under the page table lock |
6246 | * for anonymous pages, under the page lock for page and swap cache. | 6256 | * for anonymous pages, under the page lock for page and swap cache. |
6247 | * | 6257 | * |
6248 | * In addition, the page must not be on the LRU during the commit, to | 6258 | * In addition, the page must not be on the LRU during the commit, to |
6249 | * prevent racing with task migration. If it might be, use @lrucare. | 6259 | * prevent racing with task migration. If it might be, use @lrucare. |
6250 | * | 6260 | * |
6251 | * Use mem_cgroup_cancel_charge() to cancel the transaction instead. | 6261 | * Use mem_cgroup_cancel_charge() to cancel the transaction instead. |
6252 | */ | 6262 | */ |
6253 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 6263 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
6254 | bool lrucare) | 6264 | bool lrucare) |
6255 | { | 6265 | { |
6256 | unsigned int nr_pages = 1; | 6266 | unsigned int nr_pages = 1; |
6257 | 6267 | ||
6258 | VM_BUG_ON_PAGE(!page->mapping, page); | 6268 | VM_BUG_ON_PAGE(!page->mapping, page); |
6259 | VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); | 6269 | VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); |
6260 | 6270 | ||
6261 | if (mem_cgroup_disabled()) | 6271 | if (mem_cgroup_disabled()) |
6262 | return; | 6272 | return; |
6263 | /* | 6273 | /* |
6264 | * Swap faults will attempt to charge the same page multiple | 6274 | * Swap faults will attempt to charge the same page multiple |
6265 | * times. But reuse_swap_page() might have removed the page | 6275 | * times. But reuse_swap_page() might have removed the page |
6266 | * from swapcache already, so we can't check PageSwapCache(). | 6276 | * from swapcache already, so we can't check PageSwapCache(). |
6267 | */ | 6277 | */ |
6268 | if (!memcg) | 6278 | if (!memcg) |
6269 | return; | 6279 | return; |
6270 | 6280 | ||
6271 | commit_charge(page, memcg, lrucare); | 6281 | commit_charge(page, memcg, lrucare); |
6272 | 6282 | ||
6273 | if (PageTransHuge(page)) { | 6283 | if (PageTransHuge(page)) { |
6274 | nr_pages <<= compound_order(page); | 6284 | nr_pages <<= compound_order(page); |
6275 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 6285 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
6276 | } | 6286 | } |
6277 | 6287 | ||
6278 | local_irq_disable(); | 6288 | local_irq_disable(); |
6279 | mem_cgroup_charge_statistics(memcg, page, nr_pages); | 6289 | mem_cgroup_charge_statistics(memcg, page, nr_pages); |
6280 | memcg_check_events(memcg, page); | 6290 | memcg_check_events(memcg, page); |
6281 | local_irq_enable(); | 6291 | local_irq_enable(); |
6282 | 6292 | ||
6283 | if (do_swap_account && PageSwapCache(page)) { | 6293 | if (do_swap_account && PageSwapCache(page)) { |
6284 | swp_entry_t entry = { .val = page_private(page) }; | 6294 | swp_entry_t entry = { .val = page_private(page) }; |
6285 | /* | 6295 | /* |
6286 | * The swap entry might not get freed for a long time, | 6296 | * The swap entry might not get freed for a long time, |
6287 | * let's not wait for it. The page already received a | 6297 | * let's not wait for it. The page already received a |
6288 | * memory+swap charge, drop the swap entry duplicate. | 6298 | * memory+swap charge, drop the swap entry duplicate. |
6289 | */ | 6299 | */ |
6290 | mem_cgroup_uncharge_swap(entry); | 6300 | mem_cgroup_uncharge_swap(entry); |
6291 | } | 6301 | } |
6292 | } | 6302 | } |
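The @lrucare case called out in the comment above mem_cgroup_commit_charge() is mainly the swap-in path, where readahead may already have put the swapcache page on the LRU before the fault completes. A compressed, purely illustrative fragment in that spirit (fault plumbing, locking and error handling elided):

	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
		goto out_err;				/* hypothetical error label */
	/* ... rmap set up under the page table lock ... */
	mem_cgroup_commit_charge(page, memcg, true);	/* page may already be on the LRU */
	/* commit itself drops the swap entry's duplicate charge, see above */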
6293 | 6303 | ||
6294 | /** | 6304 | /** |
6295 | * mem_cgroup_cancel_charge - cancel a page charge | 6305 | * mem_cgroup_cancel_charge - cancel a page charge |
6296 | * @page: page to charge | 6306 | * @page: page to charge |
6297 | * @memcg: memcg to charge the page to | 6307 | * @memcg: memcg to charge the page to |
6298 | * | 6308 | * |
6299 | * Cancel a charge transaction started by mem_cgroup_try_charge(). | 6309 | * Cancel a charge transaction started by mem_cgroup_try_charge(). |
6300 | */ | 6310 | */ |
6301 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) | 6311 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) |
6302 | { | 6312 | { |
6303 | unsigned int nr_pages = 1; | 6313 | unsigned int nr_pages = 1; |
6304 | 6314 | ||
6305 | if (mem_cgroup_disabled()) | 6315 | if (mem_cgroup_disabled()) |
6306 | return; | 6316 | return; |
6307 | /* | 6317 | /* |
6308 | * Swap faults will attempt to charge the same page multiple | 6318 | * Swap faults will attempt to charge the same page multiple |
6309 | * times. But reuse_swap_page() might have removed the page | 6319 | * times. But reuse_swap_page() might have removed the page |
6310 | * from swapcache already, so we can't check PageSwapCache(). | 6320 | * from swapcache already, so we can't check PageSwapCache(). |
6311 | */ | 6321 | */ |
6312 | if (!memcg) | 6322 | if (!memcg) |
6313 | return; | 6323 | return; |
6314 | 6324 | ||
6315 | if (PageTransHuge(page)) { | 6325 | if (PageTransHuge(page)) { |
6316 | nr_pages <<= compound_order(page); | 6326 | nr_pages <<= compound_order(page); |
6317 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 6327 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
6318 | } | 6328 | } |
6319 | 6329 | ||
6320 | cancel_charge(memcg, nr_pages); | 6330 | cancel_charge(memcg, nr_pages); |
6321 | } | 6331 | } |
6322 | 6332 | ||
6323 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, | 6333 | static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, |
6324 | unsigned long nr_mem, unsigned long nr_memsw, | 6334 | unsigned long nr_mem, unsigned long nr_memsw, |
6325 | unsigned long nr_anon, unsigned long nr_file, | 6335 | unsigned long nr_anon, unsigned long nr_file, |
6326 | unsigned long nr_huge, struct page *dummy_page) | 6336 | unsigned long nr_huge, struct page *dummy_page) |
6327 | { | 6337 | { |
6328 | unsigned long flags; | 6338 | unsigned long flags; |
6329 | 6339 | ||
6330 | if (!mem_cgroup_is_root(memcg)) { | 6340 | if (!mem_cgroup_is_root(memcg)) { |
6331 | if (nr_mem) | 6341 | if (nr_mem) |
6332 | page_counter_uncharge(&memcg->memory, nr_mem); | 6342 | page_counter_uncharge(&memcg->memory, nr_mem); |
6333 | if (nr_memsw) | 6343 | if (nr_memsw) |
6334 | page_counter_uncharge(&memcg->memsw, nr_memsw); | 6344 | page_counter_uncharge(&memcg->memsw, nr_memsw); |
6335 | memcg_oom_recover(memcg); | 6345 | memcg_oom_recover(memcg); |
6336 | } | 6346 | } |
6337 | 6347 | ||
6338 | local_irq_save(flags); | 6348 | local_irq_save(flags); |
6339 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); | 6349 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); |
6340 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); | 6350 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); |
6341 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); | 6351 | __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); |
6342 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); | 6352 | __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); |
6343 | __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); | 6353 | __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); |
6344 | memcg_check_events(memcg, dummy_page); | 6354 | memcg_check_events(memcg, dummy_page); |
6345 | local_irq_restore(flags); | 6355 | local_irq_restore(flags); |
6356 | |||
6357 | if (!mem_cgroup_is_root(memcg)) | ||
6358 | css_put_many(&memcg->css, max(nr_mem, nr_memsw)); | ||
6346 | } | 6359 | } |
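The css_put_many() added above is where the commit's theme lands on the uncharge side: every charged page pins its memcg with one css reference, and a batch of pages being uncharged returns all of those references in a single call. A page charged to both memory and memsw still holds only one reference, which is why the count is max(nr_mem, nr_memsw) rather than the sum. A purely illustrative pairing, not a quote from the diff:

	/* charge side: one reference per charged page (or swap entry) */
	css_get(&memcg->css);

	/* uncharge side (hunk above): the batch's references dropped at once */
	if (!mem_cgroup_is_root(memcg))
		css_put_many(&memcg->css, max(nr_mem, nr_memsw));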
6347 | 6360 | ||
6348 | static void uncharge_list(struct list_head *page_list) | 6361 | static void uncharge_list(struct list_head *page_list) |
6349 | { | 6362 | { |
6350 | struct mem_cgroup *memcg = NULL; | 6363 | struct mem_cgroup *memcg = NULL; |
6351 | unsigned long nr_memsw = 0; | 6364 | unsigned long nr_memsw = 0; |
6352 | unsigned long nr_anon = 0; | 6365 | unsigned long nr_anon = 0; |
6353 | unsigned long nr_file = 0; | 6366 | unsigned long nr_file = 0; |
6354 | unsigned long nr_huge = 0; | 6367 | unsigned long nr_huge = 0; |
6355 | unsigned long pgpgout = 0; | 6368 | unsigned long pgpgout = 0; |
6356 | unsigned long nr_mem = 0; | 6369 | unsigned long nr_mem = 0; |
6357 | struct list_head *next; | 6370 | struct list_head *next; |
6358 | struct page *page; | 6371 | struct page *page; |
6359 | 6372 | ||
6360 | next = page_list->next; | 6373 | next = page_list->next; |
6361 | do { | 6374 | do { |
6362 | unsigned int nr_pages = 1; | 6375 | unsigned int nr_pages = 1; |
6363 | struct page_cgroup *pc; | 6376 | struct page_cgroup *pc; |
6364 | 6377 | ||
6365 | page = list_entry(next, struct page, lru); | 6378 | page = list_entry(next, struct page, lru); |
6366 | next = page->lru.next; | 6379 | next = page->lru.next; |
6367 | 6380 | ||
6368 | VM_BUG_ON_PAGE(PageLRU(page), page); | 6381 | VM_BUG_ON_PAGE(PageLRU(page), page); |
6369 | VM_BUG_ON_PAGE(page_count(page), page); | 6382 | VM_BUG_ON_PAGE(page_count(page), page); |
6370 | 6383 | ||
6371 | pc = lookup_page_cgroup(page); | 6384 | pc = lookup_page_cgroup(page); |
6372 | if (!PageCgroupUsed(pc)) | 6385 | if (!PageCgroupUsed(pc)) |
6373 | continue; | 6386 | continue; |
6374 | 6387 | ||
6375 | /* | 6388 | /* |
6376 | * Nobody should be changing or seriously looking at | 6389 | * Nobody should be changing or seriously looking at |
6377 | * pc->mem_cgroup and pc->flags at this point, we have | 6390 | * pc->mem_cgroup and pc->flags at this point, we have |
6378 | * fully exclusive access to the page. | 6391 | * fully exclusive access to the page. |
6379 | */ | 6392 | */ |
6380 | 6393 | ||
6381 | if (memcg != pc->mem_cgroup) { | 6394 | if (memcg != pc->mem_cgroup) { |
6382 | if (memcg) { | 6395 | if (memcg) { |
6383 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 6396 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, |
6384 | nr_anon, nr_file, nr_huge, page); | 6397 | nr_anon, nr_file, nr_huge, page); |
6385 | pgpgout = nr_mem = nr_memsw = 0; | 6398 | pgpgout = nr_mem = nr_memsw = 0; |
6386 | nr_anon = nr_file = nr_huge = 0; | 6399 | nr_anon = nr_file = nr_huge = 0; |
6387 | } | 6400 | } |
6388 | memcg = pc->mem_cgroup; | 6401 | memcg = pc->mem_cgroup; |
6389 | } | 6402 | } |
6390 | 6403 | ||
6391 | if (PageTransHuge(page)) { | 6404 | if (PageTransHuge(page)) { |
6392 | nr_pages <<= compound_order(page); | 6405 | nr_pages <<= compound_order(page); |
6393 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | 6406 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
6394 | nr_huge += nr_pages; | 6407 | nr_huge += nr_pages; |
6395 | } | 6408 | } |
6396 | 6409 | ||
6397 | if (PageAnon(page)) | 6410 | if (PageAnon(page)) |
6398 | nr_anon += nr_pages; | 6411 | nr_anon += nr_pages; |
6399 | else | 6412 | else |
6400 | nr_file += nr_pages; | 6413 | nr_file += nr_pages; |
6401 | 6414 | ||
6402 | if (pc->flags & PCG_MEM) | 6415 | if (pc->flags & PCG_MEM) |
6403 | nr_mem += nr_pages; | 6416 | nr_mem += nr_pages; |
6404 | if (pc->flags & PCG_MEMSW) | 6417 | if (pc->flags & PCG_MEMSW) |
6405 | nr_memsw += nr_pages; | 6418 | nr_memsw += nr_pages; |
6406 | pc->flags = 0; | 6419 | pc->flags = 0; |
6407 | 6420 | ||
6408 | pgpgout++; | 6421 | pgpgout++; |
6409 | } while (next != page_list); | 6422 | } while (next != page_list); |
6410 | 6423 | ||
6411 | if (memcg) | 6424 | if (memcg) |
6412 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, | 6425 | uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, |
6413 | nr_anon, nr_file, nr_huge, page); | 6426 | nr_anon, nr_file, nr_huge, page); |
6414 | } | 6427 | } |
6415 | 6428 | ||
6416 | /** | 6429 | /** |
6417 | * mem_cgroup_uncharge - uncharge a page | 6430 | * mem_cgroup_uncharge - uncharge a page |
6418 | * @page: page to uncharge | 6431 | * @page: page to uncharge |
6419 | * | 6432 | * |
6420 | * Uncharge a page previously charged with mem_cgroup_try_charge() and | 6433 | * Uncharge a page previously charged with mem_cgroup_try_charge() and |
6421 | * mem_cgroup_commit_charge(). | 6434 | * mem_cgroup_commit_charge(). |
6422 | */ | 6435 | */ |
6423 | void mem_cgroup_uncharge(struct page *page) | 6436 | void mem_cgroup_uncharge(struct page *page) |
6424 | { | 6437 | { |
6425 | struct page_cgroup *pc; | 6438 | struct page_cgroup *pc; |
6426 | 6439 | ||
6427 | if (mem_cgroup_disabled()) | 6440 | if (mem_cgroup_disabled()) |
6428 | return; | 6441 | return; |
6429 | 6442 | ||
6430 | /* Don't touch page->lru of any random page, pre-check: */ | 6443 | /* Don't touch page->lru of any random page, pre-check: */ |
6431 | pc = lookup_page_cgroup(page); | 6444 | pc = lookup_page_cgroup(page); |
6432 | if (!PageCgroupUsed(pc)) | 6445 | if (!PageCgroupUsed(pc)) |
6433 | return; | 6446 | return; |
6434 | 6447 | ||
6435 | INIT_LIST_HEAD(&page->lru); | 6448 | INIT_LIST_HEAD(&page->lru); |
6436 | uncharge_list(&page->lru); | 6449 | uncharge_list(&page->lru); |
6437 | } | 6450 | } |
6438 | 6451 | ||
6439 | /** | 6452 | /** |
6440 | * mem_cgroup_uncharge_list - uncharge a list of page | 6453 | * mem_cgroup_uncharge_list - uncharge a list of page |
6441 | * @page_list: list of pages to uncharge | 6454 | * @page_list: list of pages to uncharge |
6442 | * | 6455 | * |
6443 | * Uncharge a list of pages previously charged with | 6456 | * Uncharge a list of pages previously charged with |
6444 | * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). | 6457 | * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). |
6445 | */ | 6458 | */ |
6446 | void mem_cgroup_uncharge_list(struct list_head *page_list) | 6459 | void mem_cgroup_uncharge_list(struct list_head *page_list) |
6447 | { | 6460 | { |
6448 | if (mem_cgroup_disabled()) | 6461 | if (mem_cgroup_disabled()) |
6449 | return; | 6462 | return; |
6450 | 6463 | ||
6451 | if (!list_empty(page_list)) | 6464 | if (!list_empty(page_list)) |
6452 | uncharge_list(page_list); | 6465 | uncharge_list(page_list); |
6453 | } | 6466 | } |
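Both uncharge entry points expect pages whose last reference is already gone, as the VM_BUG_ON_PAGE(page_count(page)) in uncharge_list() insists. A hedged sketch of a caller; drop_charges_hypothetical() and the way the list was built are illustrative only, not the kernel's actual release path:

/*
 * Illustrative only: single-page vs. batched uncharging of pages that
 * are off the LRU and have dropped to a zero refcount.
 */
static void drop_charges_hypothetical(struct page *page, struct list_head *pages)
{
	mem_cgroup_uncharge(page);		/* reuses page->lru as a one-entry list */
	mem_cgroup_uncharge_list(pages);	/* one uncharge_batch() per run of same-memcg pages */
}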
6454 | 6467 | ||
6455 | /** | 6468 | /** |
6456 | * mem_cgroup_migrate - migrate a charge to another page | 6469 | * mem_cgroup_migrate - migrate a charge to another page |
6457 | * @oldpage: currently charged page | 6470 | * @oldpage: currently charged page |
6458 | * @newpage: page to transfer the charge to | 6471 | * @newpage: page to transfer the charge to |
6459 | * @lrucare: both pages might be on the LRU already | 6472 | * @lrucare: both pages might be on the LRU already |
6460 | * | 6473 | * |
6461 | * Migrate the charge from @oldpage to @newpage. | 6474 | * Migrate the charge from @oldpage to @newpage. |
6462 | * | 6475 | * |
6463 | * Both pages must be locked, @newpage->mapping must be set up. | 6476 | * Both pages must be locked, @newpage->mapping must be set up. |
6464 | */ | 6477 | */ |
6465 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, | 6478 | void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, |
6466 | bool lrucare) | 6479 | bool lrucare) |
6467 | { | 6480 | { |
6468 | struct page_cgroup *pc; | 6481 | struct page_cgroup *pc; |
6469 | int isolated; | 6482 | int isolated; |
6470 | 6483 | ||
6471 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); | 6484 | VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); |
6472 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); | 6485 | VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); |
6473 | VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); | 6486 | VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage); |
6474 | VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); | 6487 | VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage); |
6475 | VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); | 6488 | VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); |
6476 | VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), | 6489 | VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage), |
6477 | newpage); | 6490 | newpage); |
6478 | 6491 | ||
6479 | if (mem_cgroup_disabled()) | 6492 | if (mem_cgroup_disabled()) |
6480 | return; | 6493 | return; |
6481 | 6494 | ||
6482 | /* Page cache replacement: new page already charged? */ | 6495 | /* Page cache replacement: new page already charged? */ |
6483 | pc = lookup_page_cgroup(newpage); | 6496 | pc = lookup_page_cgroup(newpage); |
6484 | if (PageCgroupUsed(pc)) | 6497 | if (PageCgroupUsed(pc)) |
6485 | return; | 6498 | return; |
6486 | 6499 | ||
6487 | /* Re-entrant migration: old page already uncharged? */ | 6500 | /* Re-entrant migration: old page already uncharged? */ |
6488 | pc = lookup_page_cgroup(oldpage); | 6501 | pc = lookup_page_cgroup(oldpage); |
6489 | if (!PageCgroupUsed(pc)) | 6502 | if (!PageCgroupUsed(pc)) |
6490 | return; | 6503 | return; |
6491 | 6504 | ||
6492 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); | 6505 | VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage); |
6493 | VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); | 6506 | VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); |
6494 | 6507 | ||
6495 | if (lrucare) | 6508 | if (lrucare) |
6496 | lock_page_lru(oldpage, &isolated); | 6509 | lock_page_lru(oldpage, &isolated); |
6497 | 6510 | ||
6498 | pc->flags = 0; | 6511 | pc->flags = 0; |
6499 | 6512 | ||
6500 | if (lrucare) | 6513 | if (lrucare) |
6501 | unlock_page_lru(oldpage, isolated); | 6514 | unlock_page_lru(oldpage, isolated); |
6502 | 6515 | ||
6503 | commit_charge(newpage, pc->mem_cgroup, lrucare); | 6516 | commit_charge(newpage, pc->mem_cgroup, lrucare); |
6504 | } | 6517 | } |
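A hedged sketch of the kind of caller mem_cgroup_migrate() is written for; replace_cache_page_hypothetical() is a made-up name, standing in for the page-cache replacement and migration paths that hold both page locks and set up newpage->mapping before calling in:

/*
 * Illustrative only: handing the charge over when one page replaces
 * another.  Caller holds both page locks; newpage->mapping is set up.
 */
static void replace_cache_page_hypothetical(struct page *oldpage, struct page *newpage)
{
	/* both pages may already be on the LRU here, hence lrucare == true */
	mem_cgroup_migrate(oldpage, newpage, true);
}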
6505 | 6518 | ||
6506 | /* | 6519 | /* |
6507 | * subsys_initcall() for memory controller. | 6520 | * subsys_initcall() for memory controller. |
6508 | * | 6521 | * |
6509 | * Some parts like hotcpu_notifier() have to be initialized from this context | 6522 | * Some parts like hotcpu_notifier() have to be initialized from this context |
6510 | * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically | 6523 | * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically |
6511 | * everything that doesn't depend on a specific mem_cgroup structure should | 6524 | * everything that doesn't depend on a specific mem_cgroup structure should |
6512 | * be initialized from here. | 6525 | * be initialized from here. |
6513 | */ | 6526 | */ |
6514 | static int __init mem_cgroup_init(void) | 6527 | static int __init mem_cgroup_init(void) |
6515 | { | 6528 | { |
6516 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); | 6529 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
6517 | enable_swap_cgroup(); | 6530 | enable_swap_cgroup(); |
6518 | mem_cgroup_soft_limit_tree_init(); | 6531 | mem_cgroup_soft_limit_tree_init(); |
6519 | memcg_stock_init(); | 6532 | memcg_stock_init(); |
6520 | return 0; | 6533 | return 0; |
6521 | } | 6534 | } |
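The comment above mem_cgroup_init() explains why this work happens at subsys_initcall() time; the registration itself sits elsewhere in the file, outside the hunk shown, and uses the standard macro:

	subsys_initcall(mem_cgroup_init);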
mentioned in commit 4bdfc1