Commit 794b1248be4e7e157f5535c3ee49168aa4643349

Authored by Vladimir Davydov
Committed by Linus Torvalds
1 parent 5722d094ad

memcg, slab: separate memcg vs root cache creation paths

Memcg-awareness turned kmem_cache_create() into a dirty interweaving of
memcg-only and except-for-memcg calls.  To clean this up, let's move the
code responsible for memcg cache creation to a separate function.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Glauber Costa <glommer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 4 changed files with 111 additions and 95 deletions Inline Diff

include/linux/memcontrol.h
1 /* memcontrol.h - Memory Controller 1 /* memcontrol.h - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by 10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or 11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version. 12 * (at your option) any later version.
13 * 13 *
14 * This program is distributed in the hope that it will be useful, 14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details. 17 * GNU General Public License for more details.
18 */ 18 */
19 19
20 #ifndef _LINUX_MEMCONTROL_H 20 #ifndef _LINUX_MEMCONTROL_H
21 #define _LINUX_MEMCONTROL_H 21 #define _LINUX_MEMCONTROL_H
22 #include <linux/cgroup.h> 22 #include <linux/cgroup.h>
23 #include <linux/vm_event_item.h> 23 #include <linux/vm_event_item.h>
24 #include <linux/hardirq.h> 24 #include <linux/hardirq.h>
25 #include <linux/jump_label.h> 25 #include <linux/jump_label.h>
26 26
27 struct mem_cgroup; 27 struct mem_cgroup;
28 struct page_cgroup; 28 struct page_cgroup;
29 struct page; 29 struct page;
30 struct mm_struct; 30 struct mm_struct;
31 struct kmem_cache; 31 struct kmem_cache;
32 32
33 /* 33 /*
34 * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, 34 * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c,
35 * These two lists should keep in accord with each other. 35 * These two lists should keep in accord with each other.
36 */ 36 */
37 enum mem_cgroup_stat_index { 37 enum mem_cgroup_stat_index {
38 /* 38 /*
39 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 39 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
40 */ 40 */
41 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 41 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
42 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 42 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
43 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ 43 MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */
44 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 44 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
45 MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ 45 MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */
46 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ 46 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
47 MEM_CGROUP_STAT_NSTATS, 47 MEM_CGROUP_STAT_NSTATS,
48 }; 48 };
49 49
50 struct mem_cgroup_reclaim_cookie { 50 struct mem_cgroup_reclaim_cookie {
51 struct zone *zone; 51 struct zone *zone;
52 int priority; 52 int priority;
53 unsigned int generation; 53 unsigned int generation;
54 }; 54 };
55 55
56 #ifdef CONFIG_MEMCG 56 #ifdef CONFIG_MEMCG
57 /* 57 /*
58 * All "charge" functions with gfp_mask should use GFP_KERNEL or 58 * All "charge" functions with gfp_mask should use GFP_KERNEL or
59 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't 59 * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
60 * alloc memory but reclaims memory from all available zones. So, "where I want 60 * alloc memory but reclaims memory from all available zones. So, "where I want
61 * memory from" bits of gfp_mask has no meaning. So any bits of that field is 61 * memory from" bits of gfp_mask has no meaning. So any bits of that field is
62 * available but adding a rule is better. charge functions' gfp_mask should 62 * available but adding a rule is better. charge functions' gfp_mask should
63 * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous 63 * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
64 * codes. 64 * codes.
65 * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) 65 * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
66 */ 66 */
67 67
68 extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, 68 extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm,
69 gfp_t gfp_mask); 69 gfp_t gfp_mask);
70 /* for swap handling */ 70 /* for swap handling */
71 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 71 extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
72 struct page *page, gfp_t mask, struct mem_cgroup **memcgp); 72 struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
73 extern void mem_cgroup_commit_charge_swapin(struct page *page, 73 extern void mem_cgroup_commit_charge_swapin(struct page *page,
74 struct mem_cgroup *memcg); 74 struct mem_cgroup *memcg);
75 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); 75 extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
76 76
77 extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, 77 extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
78 gfp_t gfp_mask); 78 gfp_t gfp_mask);
79 79
80 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); 80 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
81 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); 81 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
82 82
83 /* For coalescing uncharge for reducing memcg' overhead*/ 83 /* For coalescing uncharge for reducing memcg' overhead*/
84 extern void mem_cgroup_uncharge_start(void); 84 extern void mem_cgroup_uncharge_start(void);
85 extern void mem_cgroup_uncharge_end(void); 85 extern void mem_cgroup_uncharge_end(void);
86 86
87 extern void mem_cgroup_uncharge_page(struct page *page); 87 extern void mem_cgroup_uncharge_page(struct page *page);
88 extern void mem_cgroup_uncharge_cache_page(struct page *page); 88 extern void mem_cgroup_uncharge_cache_page(struct page *page);
89 89
90 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 90 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
91 struct mem_cgroup *memcg); 91 struct mem_cgroup *memcg);
92 bool task_in_mem_cgroup(struct task_struct *task, 92 bool task_in_mem_cgroup(struct task_struct *task,
93 const struct mem_cgroup *memcg); 93 const struct mem_cgroup *memcg);
94 94
95 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); 95 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
96 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); 96 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
97 97
98 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); 98 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
99 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); 99 extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css);
100 100
101 static inline 101 static inline
102 bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) 102 bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
103 { 103 {
104 struct mem_cgroup *task_memcg; 104 struct mem_cgroup *task_memcg;
105 bool match; 105 bool match;
106 106
107 rcu_read_lock(); 107 rcu_read_lock();
108 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 108 task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
109 match = __mem_cgroup_same_or_subtree(memcg, task_memcg); 109 match = __mem_cgroup_same_or_subtree(memcg, task_memcg);
110 rcu_read_unlock(); 110 rcu_read_unlock();
111 return match; 111 return match;
112 } 112 }
113 113
114 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); 114 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
115 115
116 extern void 116 extern void
117 mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 117 mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
118 struct mem_cgroup **memcgp); 118 struct mem_cgroup **memcgp);
119 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, 119 extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
120 struct page *oldpage, struct page *newpage, bool migration_ok); 120 struct page *oldpage, struct page *newpage, bool migration_ok);
121 121
122 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, 122 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
123 struct mem_cgroup *, 123 struct mem_cgroup *,
124 struct mem_cgroup_reclaim_cookie *); 124 struct mem_cgroup_reclaim_cookie *);
125 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); 125 void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
126 126
127 /* 127 /*
128 * For memory reclaim. 128 * For memory reclaim.
129 */ 129 */
130 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec); 130 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
131 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); 131 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
132 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); 132 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
133 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); 133 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
134 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, 134 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
135 struct task_struct *p); 135 struct task_struct *p);
136 extern void mem_cgroup_replace_page_cache(struct page *oldpage, 136 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
137 struct page *newpage); 137 struct page *newpage);
138 138
139 static inline void mem_cgroup_oom_enable(void) 139 static inline void mem_cgroup_oom_enable(void)
140 { 140 {
141 WARN_ON(current->memcg_oom.may_oom); 141 WARN_ON(current->memcg_oom.may_oom);
142 current->memcg_oom.may_oom = 1; 142 current->memcg_oom.may_oom = 1;
143 } 143 }
144 144
145 static inline void mem_cgroup_oom_disable(void) 145 static inline void mem_cgroup_oom_disable(void)
146 { 146 {
147 WARN_ON(!current->memcg_oom.may_oom); 147 WARN_ON(!current->memcg_oom.may_oom);
148 current->memcg_oom.may_oom = 0; 148 current->memcg_oom.may_oom = 0;
149 } 149 }
150 150
151 static inline bool task_in_memcg_oom(struct task_struct *p) 151 static inline bool task_in_memcg_oom(struct task_struct *p)
152 { 152 {
153 return p->memcg_oom.memcg; 153 return p->memcg_oom.memcg;
154 } 154 }
155 155
156 bool mem_cgroup_oom_synchronize(bool wait); 156 bool mem_cgroup_oom_synchronize(bool wait);
157 157
158 #ifdef CONFIG_MEMCG_SWAP 158 #ifdef CONFIG_MEMCG_SWAP
159 extern int do_swap_account; 159 extern int do_swap_account;
160 #endif 160 #endif
161 161
162 static inline bool mem_cgroup_disabled(void) 162 static inline bool mem_cgroup_disabled(void)
163 { 163 {
164 if (memory_cgrp_subsys.disabled) 164 if (memory_cgrp_subsys.disabled)
165 return true; 165 return true;
166 return false; 166 return false;
167 } 167 }
168 168
169 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked, 169 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
170 unsigned long *flags); 170 unsigned long *flags);
171 171
172 extern atomic_t memcg_moving; 172 extern atomic_t memcg_moving;
173 173
174 static inline void mem_cgroup_begin_update_page_stat(struct page *page, 174 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
175 bool *locked, unsigned long *flags) 175 bool *locked, unsigned long *flags)
176 { 176 {
177 if (mem_cgroup_disabled()) 177 if (mem_cgroup_disabled())
178 return; 178 return;
179 rcu_read_lock(); 179 rcu_read_lock();
180 *locked = false; 180 *locked = false;
181 if (atomic_read(&memcg_moving)) 181 if (atomic_read(&memcg_moving))
182 __mem_cgroup_begin_update_page_stat(page, locked, flags); 182 __mem_cgroup_begin_update_page_stat(page, locked, flags);
183 } 183 }
184 184
185 void __mem_cgroup_end_update_page_stat(struct page *page, 185 void __mem_cgroup_end_update_page_stat(struct page *page,
186 unsigned long *flags); 186 unsigned long *flags);
187 static inline void mem_cgroup_end_update_page_stat(struct page *page, 187 static inline void mem_cgroup_end_update_page_stat(struct page *page,
188 bool *locked, unsigned long *flags) 188 bool *locked, unsigned long *flags)
189 { 189 {
190 if (mem_cgroup_disabled()) 190 if (mem_cgroup_disabled())
191 return; 191 return;
192 if (*locked) 192 if (*locked)
193 __mem_cgroup_end_update_page_stat(page, flags); 193 __mem_cgroup_end_update_page_stat(page, flags);
194 rcu_read_unlock(); 194 rcu_read_unlock();
195 } 195 }
196 196
197 void mem_cgroup_update_page_stat(struct page *page, 197 void mem_cgroup_update_page_stat(struct page *page,
198 enum mem_cgroup_stat_index idx, 198 enum mem_cgroup_stat_index idx,
199 int val); 199 int val);
200 200
201 static inline void mem_cgroup_inc_page_stat(struct page *page, 201 static inline void mem_cgroup_inc_page_stat(struct page *page,
202 enum mem_cgroup_stat_index idx) 202 enum mem_cgroup_stat_index idx)
203 { 203 {
204 mem_cgroup_update_page_stat(page, idx, 1); 204 mem_cgroup_update_page_stat(page, idx, 1);
205 } 205 }
206 206
207 static inline void mem_cgroup_dec_page_stat(struct page *page, 207 static inline void mem_cgroup_dec_page_stat(struct page *page,
208 enum mem_cgroup_stat_index idx) 208 enum mem_cgroup_stat_index idx)
209 { 209 {
210 mem_cgroup_update_page_stat(page, idx, -1); 210 mem_cgroup_update_page_stat(page, idx, -1);
211 } 211 }
212 212
213 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 213 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
214 gfp_t gfp_mask, 214 gfp_t gfp_mask,
215 unsigned long *total_scanned); 215 unsigned long *total_scanned);
216 216
217 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); 217 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
218 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, 218 static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
219 enum vm_event_item idx) 219 enum vm_event_item idx)
220 { 220 {
221 if (mem_cgroup_disabled()) 221 if (mem_cgroup_disabled())
222 return; 222 return;
223 __mem_cgroup_count_vm_event(mm, idx); 223 __mem_cgroup_count_vm_event(mm, idx);
224 } 224 }
225 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 225 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
226 void mem_cgroup_split_huge_fixup(struct page *head); 226 void mem_cgroup_split_huge_fixup(struct page *head);
227 #endif 227 #endif
228 228
229 #ifdef CONFIG_DEBUG_VM 229 #ifdef CONFIG_DEBUG_VM
230 bool mem_cgroup_bad_page_check(struct page *page); 230 bool mem_cgroup_bad_page_check(struct page *page);
231 void mem_cgroup_print_bad_page(struct page *page); 231 void mem_cgroup_print_bad_page(struct page *page);
232 #endif 232 #endif
233 #else /* CONFIG_MEMCG */ 233 #else /* CONFIG_MEMCG */
234 struct mem_cgroup; 234 struct mem_cgroup;
235 235
236 static inline int mem_cgroup_charge_anon(struct page *page, 236 static inline int mem_cgroup_charge_anon(struct page *page,
237 struct mm_struct *mm, gfp_t gfp_mask) 237 struct mm_struct *mm, gfp_t gfp_mask)
238 { 238 {
239 return 0; 239 return 0;
240 } 240 }
241 241
242 static inline int mem_cgroup_charge_file(struct page *page, 242 static inline int mem_cgroup_charge_file(struct page *page,
243 struct mm_struct *mm, gfp_t gfp_mask) 243 struct mm_struct *mm, gfp_t gfp_mask)
244 { 244 {
245 return 0; 245 return 0;
246 } 246 }
247 247
248 static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 248 static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
249 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) 249 struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
250 { 250 {
251 return 0; 251 return 0;
252 } 252 }
253 253
254 static inline void mem_cgroup_commit_charge_swapin(struct page *page, 254 static inline void mem_cgroup_commit_charge_swapin(struct page *page,
255 struct mem_cgroup *memcg) 255 struct mem_cgroup *memcg)
256 { 256 {
257 } 257 }
258 258
259 static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 259 static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
260 { 260 {
261 } 261 }
262 262
263 static inline void mem_cgroup_uncharge_start(void) 263 static inline void mem_cgroup_uncharge_start(void)
264 { 264 {
265 } 265 }
266 266
267 static inline void mem_cgroup_uncharge_end(void) 267 static inline void mem_cgroup_uncharge_end(void)
268 { 268 {
269 } 269 }
270 270
271 static inline void mem_cgroup_uncharge_page(struct page *page) 271 static inline void mem_cgroup_uncharge_page(struct page *page)
272 { 272 {
273 } 273 }
274 274
275 static inline void mem_cgroup_uncharge_cache_page(struct page *page) 275 static inline void mem_cgroup_uncharge_cache_page(struct page *page)
276 { 276 {
277 } 277 }
278 278
279 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 279 static inline struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
280 struct mem_cgroup *memcg) 280 struct mem_cgroup *memcg)
281 { 281 {
282 return &zone->lruvec; 282 return &zone->lruvec;
283 } 283 }
284 284
285 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, 285 static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
286 struct zone *zone) 286 struct zone *zone)
287 { 287 {
288 return &zone->lruvec; 288 return &zone->lruvec;
289 } 289 }
290 290
291 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 291 static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
292 { 292 {
293 return NULL; 293 return NULL;
294 } 294 }
295 295
296 static inline bool mm_match_cgroup(struct mm_struct *mm, 296 static inline bool mm_match_cgroup(struct mm_struct *mm,
297 struct mem_cgroup *memcg) 297 struct mem_cgroup *memcg)
298 { 298 {
299 return true; 299 return true;
300 } 300 }
301 301
302 static inline bool task_in_mem_cgroup(struct task_struct *task, 302 static inline bool task_in_mem_cgroup(struct task_struct *task,
303 const struct mem_cgroup *memcg) 303 const struct mem_cgroup *memcg)
304 { 304 {
305 return true; 305 return true;
306 } 306 }
307 307
308 static inline struct cgroup_subsys_state 308 static inline struct cgroup_subsys_state
309 *mem_cgroup_css(struct mem_cgroup *memcg) 309 *mem_cgroup_css(struct mem_cgroup *memcg)
310 { 310 {
311 return NULL; 311 return NULL;
312 } 312 }
313 313
314 static inline void 314 static inline void
315 mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 315 mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
316 struct mem_cgroup **memcgp) 316 struct mem_cgroup **memcgp)
317 { 317 {
318 } 318 }
319 319
320 static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, 320 static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
321 struct page *oldpage, struct page *newpage, bool migration_ok) 321 struct page *oldpage, struct page *newpage, bool migration_ok)
322 { 322 {
323 } 323 }
324 324
325 static inline struct mem_cgroup * 325 static inline struct mem_cgroup *
326 mem_cgroup_iter(struct mem_cgroup *root, 326 mem_cgroup_iter(struct mem_cgroup *root,
327 struct mem_cgroup *prev, 327 struct mem_cgroup *prev,
328 struct mem_cgroup_reclaim_cookie *reclaim) 328 struct mem_cgroup_reclaim_cookie *reclaim)
329 { 329 {
330 return NULL; 330 return NULL;
331 } 331 }
332 332
333 static inline void mem_cgroup_iter_break(struct mem_cgroup *root, 333 static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
334 struct mem_cgroup *prev) 334 struct mem_cgroup *prev)
335 { 335 {
336 } 336 }
337 337
338 static inline bool mem_cgroup_disabled(void) 338 static inline bool mem_cgroup_disabled(void)
339 { 339 {
340 return true; 340 return true;
341 } 341 }
342 342
343 static inline int 343 static inline int
344 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 344 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
345 { 345 {
346 return 1; 346 return 1;
347 } 347 }
348 348
349 static inline unsigned long 349 static inline unsigned long
350 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 350 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
351 { 351 {
352 return 0; 352 return 0;
353 } 353 }
354 354
355 static inline void 355 static inline void
356 mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 356 mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
357 int increment) 357 int increment)
358 { 358 {
359 } 359 }
360 360
361 static inline void 361 static inline void
362 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 362 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
363 { 363 {
364 } 364 }
365 365
366 static inline void mem_cgroup_begin_update_page_stat(struct page *page, 366 static inline void mem_cgroup_begin_update_page_stat(struct page *page,
367 bool *locked, unsigned long *flags) 367 bool *locked, unsigned long *flags)
368 { 368 {
369 } 369 }
370 370
371 static inline void mem_cgroup_end_update_page_stat(struct page *page, 371 static inline void mem_cgroup_end_update_page_stat(struct page *page,
372 bool *locked, unsigned long *flags) 372 bool *locked, unsigned long *flags)
373 { 373 {
374 } 374 }
375 375
376 static inline void mem_cgroup_oom_enable(void) 376 static inline void mem_cgroup_oom_enable(void)
377 { 377 {
378 } 378 }
379 379
380 static inline void mem_cgroup_oom_disable(void) 380 static inline void mem_cgroup_oom_disable(void)
381 { 381 {
382 } 382 }
383 383
384 static inline bool task_in_memcg_oom(struct task_struct *p) 384 static inline bool task_in_memcg_oom(struct task_struct *p)
385 { 385 {
386 return false; 386 return false;
387 } 387 }
388 388
389 static inline bool mem_cgroup_oom_synchronize(bool wait) 389 static inline bool mem_cgroup_oom_synchronize(bool wait)
390 { 390 {
391 return false; 391 return false;
392 } 392 }
393 393
394 static inline void mem_cgroup_inc_page_stat(struct page *page, 394 static inline void mem_cgroup_inc_page_stat(struct page *page,
395 enum mem_cgroup_stat_index idx) 395 enum mem_cgroup_stat_index idx)
396 { 396 {
397 } 397 }
398 398
399 static inline void mem_cgroup_dec_page_stat(struct page *page, 399 static inline void mem_cgroup_dec_page_stat(struct page *page,
400 enum mem_cgroup_stat_index idx) 400 enum mem_cgroup_stat_index idx)
401 { 401 {
402 } 402 }
403 403
404 static inline 404 static inline
405 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 405 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
406 gfp_t gfp_mask, 406 gfp_t gfp_mask,
407 unsigned long *total_scanned) 407 unsigned long *total_scanned)
408 { 408 {
409 return 0; 409 return 0;
410 } 410 }
411 411
412 static inline void mem_cgroup_split_huge_fixup(struct page *head) 412 static inline void mem_cgroup_split_huge_fixup(struct page *head)
413 { 413 {
414 } 414 }
415 415
416 static inline 416 static inline
417 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 417 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
418 { 418 {
419 } 419 }
420 static inline void mem_cgroup_replace_page_cache(struct page *oldpage, 420 static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
421 struct page *newpage) 421 struct page *newpage)
422 { 422 {
423 } 423 }
424 #endif /* CONFIG_MEMCG */ 424 #endif /* CONFIG_MEMCG */
425 425
426 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) 426 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
427 static inline bool 427 static inline bool
428 mem_cgroup_bad_page_check(struct page *page) 428 mem_cgroup_bad_page_check(struct page *page)
429 { 429 {
430 return false; 430 return false;
431 } 431 }
432 432
433 static inline void 433 static inline void
434 mem_cgroup_print_bad_page(struct page *page) 434 mem_cgroup_print_bad_page(struct page *page)
435 { 435 {
436 } 436 }
437 #endif 437 #endif
438 438
439 enum { 439 enum {
440 UNDER_LIMIT, 440 UNDER_LIMIT,
441 SOFT_LIMIT, 441 SOFT_LIMIT,
442 OVER_LIMIT, 442 OVER_LIMIT,
443 }; 443 };
444 444
445 struct sock; 445 struct sock;
446 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 446 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
447 void sock_update_memcg(struct sock *sk); 447 void sock_update_memcg(struct sock *sk);
448 void sock_release_memcg(struct sock *sk); 448 void sock_release_memcg(struct sock *sk);
449 #else 449 #else
450 static inline void sock_update_memcg(struct sock *sk) 450 static inline void sock_update_memcg(struct sock *sk)
451 { 451 {
452 } 452 }
453 static inline void sock_release_memcg(struct sock *sk) 453 static inline void sock_release_memcg(struct sock *sk)
454 { 454 {
455 } 455 }
456 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */ 456 #endif /* CONFIG_INET && CONFIG_MEMCG_KMEM */
457 457
458 #ifdef CONFIG_MEMCG_KMEM 458 #ifdef CONFIG_MEMCG_KMEM
459 extern struct static_key memcg_kmem_enabled_key; 459 extern struct static_key memcg_kmem_enabled_key;
460 460
461 extern int memcg_limited_groups_array_size; 461 extern int memcg_limited_groups_array_size;
462 462
463 /* 463 /*
464 * Helper macro to loop through all memcg-specific caches. Callers must still 464 * Helper macro to loop through all memcg-specific caches. Callers must still
465 * check if the cache is valid (it is either valid or NULL). 465 * check if the cache is valid (it is either valid or NULL).
466 * the slab_mutex must be held when looping through those caches 466 * the slab_mutex must be held when looping through those caches
467 */ 467 */
468 #define for_each_memcg_cache_index(_idx) \ 468 #define for_each_memcg_cache_index(_idx) \
469 for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) 469 for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
470 470
471 static inline bool memcg_kmem_enabled(void) 471 static inline bool memcg_kmem_enabled(void)
472 { 472 {
473 return static_key_false(&memcg_kmem_enabled_key); 473 return static_key_false(&memcg_kmem_enabled_key);
474 } 474 }
475 475
476 /* 476 /*
477 * In general, we'll do everything in our power to not incur in any overhead 477 * In general, we'll do everything in our power to not incur in any overhead
478 * for non-memcg users for the kmem functions. Not even a function call, if we 478 * for non-memcg users for the kmem functions. Not even a function call, if we
479 * can avoid it. 479 * can avoid it.
480 * 480 *
481 * Therefore, we'll inline all those functions so that in the best case, we'll 481 * Therefore, we'll inline all those functions so that in the best case, we'll
482 * see that kmemcg is off for everybody and proceed quickly. If it is on, 482 * see that kmemcg is off for everybody and proceed quickly. If it is on,
483 * we'll still do most of the flag checking inline. We check a lot of 483 * we'll still do most of the flag checking inline. We check a lot of
484 * conditions, but because they are pretty simple, they are expected to be 484 * conditions, but because they are pretty simple, they are expected to be
485 * fast. 485 * fast.
486 */ 486 */
487 bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, 487 bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
488 int order); 488 int order);
489 void __memcg_kmem_commit_charge(struct page *page, 489 void __memcg_kmem_commit_charge(struct page *page,
490 struct mem_cgroup *memcg, int order); 490 struct mem_cgroup *memcg, int order);
491 void __memcg_kmem_uncharge_pages(struct page *page, int order); 491 void __memcg_kmem_uncharge_pages(struct page *page, int order);
492 492
493 int memcg_cache_id(struct mem_cgroup *memcg); 493 int memcg_cache_id(struct mem_cgroup *memcg);
494 494
495 char *memcg_create_cache_name(struct mem_cgroup *memcg, 495 char *memcg_create_cache_name(struct mem_cgroup *memcg,
496 struct kmem_cache *root_cache); 496 struct kmem_cache *root_cache);
497 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 497 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
498 struct kmem_cache *root_cache); 498 struct kmem_cache *root_cache);
499 void memcg_free_cache_params(struct kmem_cache *s); 499 void memcg_free_cache_params(struct kmem_cache *s);
500 void memcg_register_cache(struct kmem_cache *s); 500 void memcg_register_cache(struct kmem_cache *s);
501 void memcg_unregister_cache(struct kmem_cache *s); 501 void memcg_unregister_cache(struct kmem_cache *s);
502 502
503 int memcg_update_cache_size(struct kmem_cache *s, int num_groups); 503 int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
504 void memcg_update_array_size(int num_groups); 504 void memcg_update_array_size(int num_groups);
505 505
506 struct kmem_cache * 506 struct kmem_cache *
507 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); 507 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
508 508
509 void mem_cgroup_destroy_cache(struct kmem_cache *cachep); 509 void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
510 void kmem_cache_destroy_memcg_children(struct kmem_cache *s); 510 void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
511 511
512 /** 512 /**
513 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed. 513 * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
514 * @gfp: the gfp allocation flags. 514 * @gfp: the gfp allocation flags.
515 * @memcg: a pointer to the memcg this was charged against. 515 * @memcg: a pointer to the memcg this was charged against.
516 * @order: allocation order. 516 * @order: allocation order.
517 * 517 *
518 * returns true if the memcg where the current task belongs can hold this 518 * returns true if the memcg where the current task belongs can hold this
519 * allocation. 519 * allocation.
520 * 520 *
521 * We return true automatically if this allocation is not to be accounted to 521 * We return true automatically if this allocation is not to be accounted to
522 * any memcg. 522 * any memcg.
523 */ 523 */
524 static inline bool 524 static inline bool
525 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) 525 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
526 { 526 {
527 if (!memcg_kmem_enabled()) 527 if (!memcg_kmem_enabled())
528 return true; 528 return true;
529 529
530 /* 530 /*
531 * __GFP_NOFAIL allocations will move on even if charging is not 531 * __GFP_NOFAIL allocations will move on even if charging is not
532 * possible. Therefore we don't even try, and have this allocation 532 * possible. Therefore we don't even try, and have this allocation
533 * unaccounted. We could in theory charge it with 533 * unaccounted. We could in theory charge it with
534 * res_counter_charge_nofail, but we hope those allocations are rare, 534 * res_counter_charge_nofail, but we hope those allocations are rare,
535 * and won't be worth the trouble. 535 * and won't be worth the trouble.
536 */ 536 */
537 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) 537 if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
538 return true; 538 return true;
539 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 539 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
540 return true; 540 return true;
541 541
542 /* If the test is dying, just let it go. */ 542 /* If the test is dying, just let it go. */
543 if (unlikely(fatal_signal_pending(current))) 543 if (unlikely(fatal_signal_pending(current)))
544 return true; 544 return true;
545 545
546 return __memcg_kmem_newpage_charge(gfp, memcg, order); 546 return __memcg_kmem_newpage_charge(gfp, memcg, order);
547 } 547 }
548 548
549 /** 549 /**
550 * memcg_kmem_uncharge_pages: uncharge pages from memcg 550 * memcg_kmem_uncharge_pages: uncharge pages from memcg
551 * @page: pointer to struct page being freed 551 * @page: pointer to struct page being freed
552 * @order: allocation order. 552 * @order: allocation order.
553 * 553 *
554 * there is no need to specify memcg here, since it is embedded in page_cgroup 554 * there is no need to specify memcg here, since it is embedded in page_cgroup
555 */ 555 */
556 static inline void 556 static inline void
557 memcg_kmem_uncharge_pages(struct page *page, int order) 557 memcg_kmem_uncharge_pages(struct page *page, int order)
558 { 558 {
559 if (memcg_kmem_enabled()) 559 if (memcg_kmem_enabled())
560 __memcg_kmem_uncharge_pages(page, order); 560 __memcg_kmem_uncharge_pages(page, order);
561 } 561 }
562 562
563 /** 563 /**
564 * memcg_kmem_commit_charge: embeds correct memcg in a page 564 * memcg_kmem_commit_charge: embeds correct memcg in a page
565 * @page: pointer to struct page recently allocated 565 * @page: pointer to struct page recently allocated
566 * @memcg: the memcg structure we charged against 566 * @memcg: the memcg structure we charged against
567 * @order: allocation order. 567 * @order: allocation order.
568 * 568 *
569 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or 569 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
570 * failure of the allocation. if @page is NULL, this function will revert the 570 * failure of the allocation. if @page is NULL, this function will revert the
571 * charges. Otherwise, it will commit the memcg given by @memcg to the 571 * charges. Otherwise, it will commit the memcg given by @memcg to the
572 * corresponding page_cgroup. 572 * corresponding page_cgroup.
573 */ 573 */
574 static inline void 574 static inline void
575 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 575 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
576 { 576 {
577 if (memcg_kmem_enabled() && memcg) 577 if (memcg_kmem_enabled() && memcg)
578 __memcg_kmem_commit_charge(page, memcg, order); 578 __memcg_kmem_commit_charge(page, memcg, order);
579 } 579 }
580 580
581 /** 581 /**
582 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation 582 * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
583 * @cachep: the original global kmem cache 583 * @cachep: the original global kmem cache
584 * @gfp: allocation flags. 584 * @gfp: allocation flags.
585 * 585 *
586 * This function assumes that the task allocating, which determines the memcg 586 * This function assumes that the task allocating, which determines the memcg
587 * in the page allocator, belongs to the same cgroup throughout the whole 587 * in the page allocator, belongs to the same cgroup throughout the whole
588 * process. Misacounting can happen if the task calls memcg_kmem_get_cache() 588 * process. Misacounting can happen if the task calls memcg_kmem_get_cache()
589 * while belonging to a cgroup, and later on changes. This is considered 589 * while belonging to a cgroup, and later on changes. This is considered
590 * acceptable, and should only happen upon task migration. 590 * acceptable, and should only happen upon task migration.
591 * 591 *
592 * Before the cache is created by the memcg core, there is also a possible 592 * Before the cache is created by the memcg core, there is also a possible
593 * imbalance: the task belongs to a memcg, but the cache being allocated from 593 * imbalance: the task belongs to a memcg, but the cache being allocated from
594 * is the global cache, since the child cache is not yet guaranteed to be 594 * is the global cache, since the child cache is not yet guaranteed to be
595 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be 595 * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
596 * passed and the page allocator will not attempt any cgroup accounting. 596 * passed and the page allocator will not attempt any cgroup accounting.
597 */ 597 */
598 static __always_inline struct kmem_cache * 598 static __always_inline struct kmem_cache *
599 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 599 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
600 { 600 {
601 if (!memcg_kmem_enabled()) 601 if (!memcg_kmem_enabled())
602 return cachep; 602 return cachep;
603 if (gfp & __GFP_NOFAIL) 603 if (gfp & __GFP_NOFAIL)
604 return cachep; 604 return cachep;
605 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) 605 if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
606 return cachep; 606 return cachep;
607 if (unlikely(fatal_signal_pending(current))) 607 if (unlikely(fatal_signal_pending(current)))
608 return cachep; 608 return cachep;
609 609
610 return __memcg_kmem_get_cache(cachep, gfp); 610 return __memcg_kmem_get_cache(cachep, gfp);
611 } 611 }
612 #else 612 #else
613 #define for_each_memcg_cache_index(_idx) \ 613 #define for_each_memcg_cache_index(_idx) \
614 for (; NULL; ) 614 for (; NULL; )
615 615
616 static inline bool memcg_kmem_enabled(void) 616 static inline bool memcg_kmem_enabled(void)
617 { 617 {
618 return false; 618 return false;
619 } 619 }
620 620
621 static inline bool 621 static inline bool
622 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) 622 memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
623 { 623 {
624 return true; 624 return true;
625 } 625 }
626 626
627 static inline void memcg_kmem_uncharge_pages(struct page *page, int order) 627 static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
628 { 628 {
629 } 629 }
630 630
631 static inline void 631 static inline void
632 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 632 memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
633 { 633 {
634 } 634 }
635 635
636 static inline int memcg_cache_id(struct mem_cgroup *memcg) 636 static inline int memcg_cache_id(struct mem_cgroup *memcg)
637 { 637 {
638 return -1; 638 return -1;
639 } 639 }
640 640
641 static inline char *memcg_create_cache_name(struct mem_cgroup *memcg,
642 struct kmem_cache *root_cache)
643 {
644 return NULL;
645 }
646
647 static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 641 static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
648 struct kmem_cache *s, struct kmem_cache *root_cache) 642 struct kmem_cache *s, struct kmem_cache *root_cache)
649 { 643 {
650 return 0; 644 return 0;
651 } 645 }
652 646
653 static inline void memcg_free_cache_params(struct kmem_cache *s) 647 static inline void memcg_free_cache_params(struct kmem_cache *s)
654 { 648 {
655 } 649 }
656 650
657 static inline void memcg_register_cache(struct kmem_cache *s) 651 static inline void memcg_register_cache(struct kmem_cache *s)
658 { 652 {
659 } 653 }
660 654
661 static inline void memcg_unregister_cache(struct kmem_cache *s) 655 static inline void memcg_unregister_cache(struct kmem_cache *s)
662 { 656 {
663 } 657 }
664 658
665 static inline struct kmem_cache * 659 static inline struct kmem_cache *
666 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 660 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
667 { 661 {
668 return cachep; 662 return cachep;
669 } 663 }
670 664
671 static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 665 static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
672 { 666 {
673 } 667 }
674 #endif /* CONFIG_MEMCG_KMEM */ 668 #endif /* CONFIG_MEMCG_KMEM */
675 #endif /* _LINUX_MEMCONTROL_H */ 669 #endif /* _LINUX_MEMCONTROL_H */
676 670
677 671
include/linux/slab.h
1 /* 1 /*
2 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). 2 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
3 * 3 *
4 * (C) SGI 2006, Christoph Lameter 4 * (C) SGI 2006, Christoph Lameter
5 * Cleaned up and restructured to ease the addition of alternative 5 * Cleaned up and restructured to ease the addition of alternative
6 * implementations of SLAB allocators. 6 * implementations of SLAB allocators.
7 * (C) Linux Foundation 2008-2013 7 * (C) Linux Foundation 2008-2013
8 * Unified interface for all slab allocators 8 * Unified interface for all slab allocators
9 */ 9 */
10 10
11 #ifndef _LINUX_SLAB_H 11 #ifndef _LINUX_SLAB_H
12 #define _LINUX_SLAB_H 12 #define _LINUX_SLAB_H
13 13
14 #include <linux/gfp.h> 14 #include <linux/gfp.h>
15 #include <linux/types.h> 15 #include <linux/types.h>
16 #include <linux/workqueue.h> 16 #include <linux/workqueue.h>
17 17
18 18
19 /* 19 /*
20 * Flags to pass to kmem_cache_create(). 20 * Flags to pass to kmem_cache_create().
21 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set. 21 * The ones marked DEBUG are only valid if CONFIG_SLAB_DEBUG is set.
22 */ 22 */
23 #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */ 23 #define SLAB_DEBUG_FREE 0x00000100UL /* DEBUG: Perform (expensive) checks on free */
24 #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ 24 #define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */
25 #define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ 25 #define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */
26 #define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ 26 #define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */
27 #define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ 27 #define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */
28 #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ 28 #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */
29 #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ 29 #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */
30 /* 30 /*
31 * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! 31 * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS!
32 * 32 *
33 * This delays freeing the SLAB page by a grace period, it does _NOT_ 33 * This delays freeing the SLAB page by a grace period, it does _NOT_
34 * delay object freeing. This means that if you do kmem_cache_free() 34 * delay object freeing. This means that if you do kmem_cache_free()
35 * that memory location is free to be reused at any time. Thus it may 35 * that memory location is free to be reused at any time. Thus it may
36 * be possible to see another object there in the same RCU grace period. 36 * be possible to see another object there in the same RCU grace period.
37 * 37 *
38 * This feature only ensures the memory location backing the object 38 * This feature only ensures the memory location backing the object
39 * stays valid, the trick to using this is relying on an independent 39 * stays valid, the trick to using this is relying on an independent
40 * object validation pass. Something like: 40 * object validation pass. Something like:
41 * 41 *
42 * rcu_read_lock() 42 * rcu_read_lock()
43 * again: 43 * again:
44 * obj = lockless_lookup(key); 44 * obj = lockless_lookup(key);
45 * if (obj) { 45 * if (obj) {
46 * if (!try_get_ref(obj)) // might fail for free objects 46 * if (!try_get_ref(obj)) // might fail for free objects
47 * goto again; 47 * goto again;
48 * 48 *
49 * if (obj->key != key) { // not the object we expected 49 * if (obj->key != key) { // not the object we expected
50 * put_ref(obj); 50 * put_ref(obj);
51 * goto again; 51 * goto again;
52 * } 52 * }
53 * } 53 * }
54 * rcu_read_unlock(); 54 * rcu_read_unlock();
55 * 55 *
56 * This is useful if we need to approach a kernel structure obliquely, 56 * This is useful if we need to approach a kernel structure obliquely,
57 * from its address obtained without the usual locking. We can lock 57 * from its address obtained without the usual locking. We can lock
58 * the structure to stabilize it and check it's still at the given address, 58 * the structure to stabilize it and check it's still at the given address,
59 * only if we can be sure that the memory has not been meanwhile reused 59 * only if we can be sure that the memory has not been meanwhile reused
60 * for some other kind of object (which our subsystem's lock might corrupt). 60 * for some other kind of object (which our subsystem's lock might corrupt).
61 * 61 *
62 * rcu_read_lock before reading the address, then rcu_read_unlock after 62 * rcu_read_lock before reading the address, then rcu_read_unlock after
63 * taking the spinlock within the structure expected at that address. 63 * taking the spinlock within the structure expected at that address.
64 */ 64 */
65 #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ 65 #define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */
66 #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ 66 #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */
67 #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ 67 #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */
68 68
69 /* Flag to prevent checks on free */ 69 /* Flag to prevent checks on free */
70 #ifdef CONFIG_DEBUG_OBJECTS 70 #ifdef CONFIG_DEBUG_OBJECTS
71 # define SLAB_DEBUG_OBJECTS 0x00400000UL 71 # define SLAB_DEBUG_OBJECTS 0x00400000UL
72 #else 72 #else
73 # define SLAB_DEBUG_OBJECTS 0x00000000UL 73 # define SLAB_DEBUG_OBJECTS 0x00000000UL
74 #endif 74 #endif
75 75
76 #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ 76 #define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */
77 77
78 /* Don't track use of uninitialized memory */ 78 /* Don't track use of uninitialized memory */
79 #ifdef CONFIG_KMEMCHECK 79 #ifdef CONFIG_KMEMCHECK
80 # define SLAB_NOTRACK 0x01000000UL 80 # define SLAB_NOTRACK 0x01000000UL
81 #else 81 #else
82 # define SLAB_NOTRACK 0x00000000UL 82 # define SLAB_NOTRACK 0x00000000UL
83 #endif 83 #endif
84 #ifdef CONFIG_FAILSLAB 84 #ifdef CONFIG_FAILSLAB
85 # define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ 85 # define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */
86 #else 86 #else
87 # define SLAB_FAILSLAB 0x00000000UL 87 # define SLAB_FAILSLAB 0x00000000UL
88 #endif 88 #endif
89 89
90 /* The following flags affect the page allocator grouping pages by mobility */ 90 /* The following flags affect the page allocator grouping pages by mobility */
91 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ 91 #define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */
92 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 92 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
93 /* 93 /*
94 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. 94 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
95 * 95 *
96 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. 96 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
97 * 97 *
98 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. 98 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
99 * Both make kfree a no-op. 99 * Both make kfree a no-op.
100 */ 100 */
101 #define ZERO_SIZE_PTR ((void *)16) 101 #define ZERO_SIZE_PTR ((void *)16)
102 102
103 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ 103 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
104 (unsigned long)ZERO_SIZE_PTR) 104 (unsigned long)ZERO_SIZE_PTR)
105 105
106 #include <linux/kmemleak.h> 106 #include <linux/kmemleak.h>
107 107
108 struct mem_cgroup; 108 struct mem_cgroup;
109 /* 109 /*
110 * struct kmem_cache related prototypes 110 * struct kmem_cache related prototypes
111 */ 111 */
112 void __init kmem_cache_init(void); 112 void __init kmem_cache_init(void);
113 int slab_is_available(void); 113 int slab_is_available(void);
114 114
115 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 115 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
116 unsigned long, 116 unsigned long,
117 void (*)(void *)); 117 void (*)(void *));
118 struct kmem_cache * 118 #ifdef CONFIG_MEMCG_KMEM
119 kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t, 119 void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *);
120 unsigned long, void (*)(void *), struct kmem_cache *); 120 #endif
121 void kmem_cache_destroy(struct kmem_cache *); 121 void kmem_cache_destroy(struct kmem_cache *);
122 int kmem_cache_shrink(struct kmem_cache *); 122 int kmem_cache_shrink(struct kmem_cache *);
123 void kmem_cache_free(struct kmem_cache *, void *); 123 void kmem_cache_free(struct kmem_cache *, void *);
124 124
125 /* 125 /*
126 * Please use this macro to create slab caches. Simply specify the 126 * Please use this macro to create slab caches. Simply specify the
127 * name of the structure and maybe some flags that are listed above. 127 * name of the structure and maybe some flags that are listed above.
128 * 128 *
129 * The alignment of the struct determines object alignment. If you 129 * The alignment of the struct determines object alignment. If you
130 * f.e. add ____cacheline_aligned_in_smp to the struct declaration 130 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
131 * then the objects will be properly aligned in SMP configurations. 131 * then the objects will be properly aligned in SMP configurations.
132 */ 132 */
133 #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ 133 #define KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\
134 sizeof(struct __struct), __alignof__(struct __struct),\ 134 sizeof(struct __struct), __alignof__(struct __struct),\
135 (__flags), NULL) 135 (__flags), NULL)
136 136
137 /* 137 /*
138 * Common kmalloc functions provided by all allocators 138 * Common kmalloc functions provided by all allocators
139 */ 139 */
140 void * __must_check __krealloc(const void *, size_t, gfp_t); 140 void * __must_check __krealloc(const void *, size_t, gfp_t);
141 void * __must_check krealloc(const void *, size_t, gfp_t); 141 void * __must_check krealloc(const void *, size_t, gfp_t);
142 void kfree(const void *); 142 void kfree(const void *);
143 void kzfree(const void *); 143 void kzfree(const void *);
144 size_t ksize(const void *); 144 size_t ksize(const void *);
145 145
146 /* 146 /*
147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed 147 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
148 * alignment larger than the alignment of a 64-bit integer. 148 * alignment larger than the alignment of a 64-bit integer.
149 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. 149 * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that.
150 */ 150 */
151 #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 151 #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8
152 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN 152 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
153 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN 153 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN
154 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) 154 #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN)
155 #else 155 #else
156 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) 156 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
157 #endif 157 #endif
158 158
159 #ifdef CONFIG_SLOB 159 #ifdef CONFIG_SLOB
160 /* 160 /*
161 * Common fields provided in kmem_cache by all slab allocators 161 * Common fields provided in kmem_cache by all slab allocators
162 * This struct is either used directly by the allocator (SLOB) 162 * This struct is either used directly by the allocator (SLOB)
163 * or the allocator must include definitions for all fields 163 * or the allocator must include definitions for all fields
164 * provided in kmem_cache_common in their definition of kmem_cache. 164 * provided in kmem_cache_common in their definition of kmem_cache.
165 * 165 *
166 * Once we can do anonymous structs (C11 standard) we could put a 166 * Once we can do anonymous structs (C11 standard) we could put a
167 * anonymous struct definition in these allocators so that the 167 * anonymous struct definition in these allocators so that the
168 * separate allocations in the kmem_cache structure of SLAB and 168 * separate allocations in the kmem_cache structure of SLAB and
169 * SLUB is no longer needed. 169 * SLUB is no longer needed.
170 */ 170 */
171 struct kmem_cache { 171 struct kmem_cache {
172 unsigned int object_size;/* The original size of the object */ 172 unsigned int object_size;/* The original size of the object */
173 unsigned int size; /* The aligned/padded/added on size */ 173 unsigned int size; /* The aligned/padded/added on size */
174 unsigned int align; /* Alignment as calculated */ 174 unsigned int align; /* Alignment as calculated */
175 unsigned long flags; /* Active flags on the slab */ 175 unsigned long flags; /* Active flags on the slab */
176 const char *name; /* Slab name for sysfs */ 176 const char *name; /* Slab name for sysfs */
177 int refcount; /* Use counter */ 177 int refcount; /* Use counter */
178 void (*ctor)(void *); /* Called on object slot creation */ 178 void (*ctor)(void *); /* Called on object slot creation */
179 struct list_head list; /* List of all slab caches on the system */ 179 struct list_head list; /* List of all slab caches on the system */
180 }; 180 };
181 181
182 #endif /* CONFIG_SLOB */ 182 #endif /* CONFIG_SLOB */
183 183
184 /* 184 /*
185 * Kmalloc array related definitions 185 * Kmalloc array related definitions
186 */ 186 */
187 187
188 #ifdef CONFIG_SLAB 188 #ifdef CONFIG_SLAB
189 /* 189 /*
190 * The largest kmalloc size supported by the SLAB allocators is 190 * The largest kmalloc size supported by the SLAB allocators is
191 * 32 megabyte (2^25) or the maximum allocatable page order if that is 191 * 32 megabyte (2^25) or the maximum allocatable page order if that is
192 * less than 32 MB. 192 * less than 32 MB.
193 * 193 *
194 * WARNING: Its not easy to increase this value since the allocators have 194 * WARNING: Its not easy to increase this value since the allocators have
195 * to do various tricks to work around compiler limitations in order to 195 * to do various tricks to work around compiler limitations in order to
196 * ensure proper constant folding. 196 * ensure proper constant folding.
197 */ 197 */
198 #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ 198 #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \
199 (MAX_ORDER + PAGE_SHIFT - 1) : 25) 199 (MAX_ORDER + PAGE_SHIFT - 1) : 25)
200 #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH 200 #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH
201 #ifndef KMALLOC_SHIFT_LOW 201 #ifndef KMALLOC_SHIFT_LOW
202 #define KMALLOC_SHIFT_LOW 5 202 #define KMALLOC_SHIFT_LOW 5
203 #endif 203 #endif
204 #endif 204 #endif
205 205
206 #ifdef CONFIG_SLUB 206 #ifdef CONFIG_SLUB
207 /* 207 /*
208 * SLUB directly allocates requests fitting into an order-1 page 208 * SLUB directly allocates requests fitting into an order-1 page
209 * (PAGE_SIZE*2). Larger requests are passed to the page allocator. 209 * (PAGE_SIZE*2). Larger requests are passed to the page allocator.
210 */ 210 */
211 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) 211 #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1)
212 #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) 212 #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT)
213 #ifndef KMALLOC_SHIFT_LOW 213 #ifndef KMALLOC_SHIFT_LOW
214 #define KMALLOC_SHIFT_LOW 3 214 #define KMALLOC_SHIFT_LOW 3
215 #endif 215 #endif
216 #endif 216 #endif
217 217
218 #ifdef CONFIG_SLOB 218 #ifdef CONFIG_SLOB
219 /* 219 /*
220 * SLOB passes all requests larger than one page to the page allocator. 220 * SLOB passes all requests larger than one page to the page allocator.
221 * No kmalloc array is necessary since objects of different sizes can 221 * No kmalloc array is necessary since objects of different sizes can
222 * be allocated from the same page. 222 * be allocated from the same page.
223 */ 223 */
224 #define KMALLOC_SHIFT_HIGH PAGE_SHIFT 224 #define KMALLOC_SHIFT_HIGH PAGE_SHIFT
225 #define KMALLOC_SHIFT_MAX 30 225 #define KMALLOC_SHIFT_MAX 30
226 #ifndef KMALLOC_SHIFT_LOW 226 #ifndef KMALLOC_SHIFT_LOW
227 #define KMALLOC_SHIFT_LOW 3 227 #define KMALLOC_SHIFT_LOW 3
228 #endif 228 #endif
229 #endif 229 #endif
230 230
231 /* Maximum allocatable size */ 231 /* Maximum allocatable size */
232 #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) 232 #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX)
233 /* Maximum size for which we actually use a slab cache */ 233 /* Maximum size for which we actually use a slab cache */
234 #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) 234 #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH)
235 /* Maximum order allocatable via the slab allocator */ 235 /* Maximum order allocatable via the slab allocator */
236 #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) 236 #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT)
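To make the limits above concrete, here is a minimal userspace sketch that re-derives them for the SLUB case, assuming 4 KiB pages (PAGE_SHIFT of 12) and MAX_ORDER of 11; both values are assumptions for illustration, not taken from this patch.

#include <stdio.h>

/* Assumed configuration, for illustration only. */
#define PAGE_SHIFT	12			/* 4 KiB pages */
#define MAX_ORDER	11

/* SLUB flavour of the macros above. */
#define KMALLOC_SHIFT_HIGH	(PAGE_SHIFT + 1)		/* 13 */
#define KMALLOC_SHIFT_MAX	(MAX_ORDER + PAGE_SHIFT)	/* 23 */

int main(void)
{
	/* Largest size served from a kmalloc slab cache: 8 KiB (an order-1 page). */
	printf("KMALLOC_MAX_CACHE_SIZE = %lu\n", 1UL << KMALLOC_SHIFT_HIGH);
	/* Largest size kmalloc() will hand to the page allocator: 8 MiB. */
	printf("KMALLOC_MAX_SIZE       = %lu\n", 1UL << KMALLOC_SHIFT_MAX);
	/* Highest page order that corresponds to: 11. */
	printf("KMALLOC_MAX_ORDER      = %d\n", KMALLOC_SHIFT_MAX - PAGE_SHIFT);
	return 0;
}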
237 237
238 /* 238 /*
239 * Kmalloc subsystem. 239 * Kmalloc subsystem.
240 */ 240 */
241 #ifndef KMALLOC_MIN_SIZE 241 #ifndef KMALLOC_MIN_SIZE
242 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) 242 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
243 #endif 243 #endif
244 244
245 #ifndef CONFIG_SLOB 245 #ifndef CONFIG_SLOB
246 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 246 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
247 #ifdef CONFIG_ZONE_DMA 247 #ifdef CONFIG_ZONE_DMA
248 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 248 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
249 #endif 249 #endif
250 250
251 /* 251 /*
252 * Figure out which kmalloc slab an allocation of a certain size 252 * Figure out which kmalloc slab an allocation of a certain size
253 * belongs to. 253 * belongs to.
254 * 0 = zero alloc 254 * 0 = zero alloc
255 * 1 = 65 .. 96 bytes 255 * 1 = 65 .. 96 bytes
256 * 2 = 129 .. 192 bytes 256 * 2 = 129 .. 192 bytes
257 * n = 2^(n-1)+1 .. 2^n 257 * n = 2^(n-1)+1 .. 2^n
258 */ 258 */
259 static __always_inline int kmalloc_index(size_t size) 259 static __always_inline int kmalloc_index(size_t size)
260 { 260 {
261 if (!size) 261 if (!size)
262 return 0; 262 return 0;
263 263
264 if (size <= KMALLOC_MIN_SIZE) 264 if (size <= KMALLOC_MIN_SIZE)
265 return KMALLOC_SHIFT_LOW; 265 return KMALLOC_SHIFT_LOW;
266 266
267 if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) 267 if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
268 return 1; 268 return 1;
269 if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) 269 if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
270 return 2; 270 return 2;
271 if (size <= 8) return 3; 271 if (size <= 8) return 3;
272 if (size <= 16) return 4; 272 if (size <= 16) return 4;
273 if (size <= 32) return 5; 273 if (size <= 32) return 5;
274 if (size <= 64) return 6; 274 if (size <= 64) return 6;
275 if (size <= 128) return 7; 275 if (size <= 128) return 7;
276 if (size <= 256) return 8; 276 if (size <= 256) return 8;
277 if (size <= 512) return 9; 277 if (size <= 512) return 9;
278 if (size <= 1024) return 10; 278 if (size <= 1024) return 10;
279 if (size <= 2 * 1024) return 11; 279 if (size <= 2 * 1024) return 11;
280 if (size <= 4 * 1024) return 12; 280 if (size <= 4 * 1024) return 12;
281 if (size <= 8 * 1024) return 13; 281 if (size <= 8 * 1024) return 13;
282 if (size <= 16 * 1024) return 14; 282 if (size <= 16 * 1024) return 14;
283 if (size <= 32 * 1024) return 15; 283 if (size <= 32 * 1024) return 15;
284 if (size <= 64 * 1024) return 16; 284 if (size <= 64 * 1024) return 16;
285 if (size <= 128 * 1024) return 17; 285 if (size <= 128 * 1024) return 17;
286 if (size <= 256 * 1024) return 18; 286 if (size <= 256 * 1024) return 18;
287 if (size <= 512 * 1024) return 19; 287 if (size <= 512 * 1024) return 19;
288 if (size <= 1024 * 1024) return 20; 288 if (size <= 1024 * 1024) return 20;
289 if (size <= 2 * 1024 * 1024) return 21; 289 if (size <= 2 * 1024 * 1024) return 21;
290 if (size <= 4 * 1024 * 1024) return 22; 290 if (size <= 4 * 1024 * 1024) return 22;
291 if (size <= 8 * 1024 * 1024) return 23; 291 if (size <= 8 * 1024 * 1024) return 23;
292 if (size <= 16 * 1024 * 1024) return 24; 292 if (size <= 16 * 1024 * 1024) return 24;
293 if (size <= 32 * 1024 * 1024) return 25; 293 if (size <= 32 * 1024 * 1024) return 25;
294 if (size <= 64 * 1024 * 1024) return 26; 294 if (size <= 64 * 1024 * 1024) return 26;
295 BUG(); 295 BUG();
296 296
297 /* Will never be reached. Needed because the compiler may complain */ 297 /* Will never be reached. Needed because the compiler may complain */
298 return -1; 298 return -1;
299 } 299 }
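A quick feel for the mapping, assuming KMALLOC_MIN_SIZE is 8 (KMALLOC_SHIFT_LOW of 3) and a non-SLOB build: kmalloc_index(8) returns 3, kmalloc_index(96) hits the special index 1, and kmalloc_index(100) rounds up to the 128-byte cache at index 7. A minimal sketch of how a constant size resolves to a cache slot follows; the function name is made up for illustration.

/* Illustration only; assumes KMALLOC_MIN_SIZE == 8 and !CONFIG_SLOB. */
static __always_inline struct kmem_cache *cache_for_100_bytes(void)
{
	int index = kmalloc_index(100);	/* 100 > 96 and <= 128, so index == 7 */

	/* This is essentially what kmalloc() below does for constant sizes. */
	return kmalloc_caches[index];
}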
300 #endif /* !CONFIG_SLOB */ 300 #endif /* !CONFIG_SLOB */
301 301
302 void *__kmalloc(size_t size, gfp_t flags); 302 void *__kmalloc(size_t size, gfp_t flags);
303 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); 303 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
304 304
305 #ifdef CONFIG_NUMA 305 #ifdef CONFIG_NUMA
306 void *__kmalloc_node(size_t size, gfp_t flags, int node); 306 void *__kmalloc_node(size_t size, gfp_t flags, int node);
307 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); 307 void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
308 #else 308 #else
309 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) 309 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
310 { 310 {
311 return __kmalloc(size, flags); 311 return __kmalloc(size, flags);
312 } 312 }
313 313
314 static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) 314 static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
315 { 315 {
316 return kmem_cache_alloc(s, flags); 316 return kmem_cache_alloc(s, flags);
317 } 317 }
318 #endif 318 #endif
319 319
320 #ifdef CONFIG_TRACING 320 #ifdef CONFIG_TRACING
321 extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t); 321 extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
322 322
323 #ifdef CONFIG_NUMA 323 #ifdef CONFIG_NUMA
324 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, 324 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
325 gfp_t gfpflags, 325 gfp_t gfpflags,
326 int node, size_t size); 326 int node, size_t size);
327 #else 327 #else
328 static __always_inline void * 328 static __always_inline void *
329 kmem_cache_alloc_node_trace(struct kmem_cache *s, 329 kmem_cache_alloc_node_trace(struct kmem_cache *s,
330 gfp_t gfpflags, 330 gfp_t gfpflags,
331 int node, size_t size) 331 int node, size_t size)
332 { 332 {
333 return kmem_cache_alloc_trace(s, gfpflags, size); 333 return kmem_cache_alloc_trace(s, gfpflags, size);
334 } 334 }
335 #endif /* CONFIG_NUMA */ 335 #endif /* CONFIG_NUMA */
336 336
337 #else /* CONFIG_TRACING */ 337 #else /* CONFIG_TRACING */
338 static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, 338 static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
339 gfp_t flags, size_t size) 339 gfp_t flags, size_t size)
340 { 340 {
341 return kmem_cache_alloc(s, flags); 341 return kmem_cache_alloc(s, flags);
342 } 342 }
343 343
344 static __always_inline void * 344 static __always_inline void *
345 kmem_cache_alloc_node_trace(struct kmem_cache *s, 345 kmem_cache_alloc_node_trace(struct kmem_cache *s,
346 gfp_t gfpflags, 346 gfp_t gfpflags,
347 int node, size_t size) 347 int node, size_t size)
348 { 348 {
349 return kmem_cache_alloc_node(s, gfpflags, node); 349 return kmem_cache_alloc_node(s, gfpflags, node);
350 } 350 }
351 #endif /* CONFIG_TRACING */ 351 #endif /* CONFIG_TRACING */
352 352
353 #ifdef CONFIG_SLAB 353 #ifdef CONFIG_SLAB
354 #include <linux/slab_def.h> 354 #include <linux/slab_def.h>
355 #endif 355 #endif
356 356
357 #ifdef CONFIG_SLUB 357 #ifdef CONFIG_SLUB
358 #include <linux/slub_def.h> 358 #include <linux/slub_def.h>
359 #endif 359 #endif
360 360
361 static __always_inline void * 361 static __always_inline void *
362 kmalloc_order(size_t size, gfp_t flags, unsigned int order) 362 kmalloc_order(size_t size, gfp_t flags, unsigned int order)
363 { 363 {
364 void *ret; 364 void *ret;
365 365
366 flags |= (__GFP_COMP | __GFP_KMEMCG); 366 flags |= (__GFP_COMP | __GFP_KMEMCG);
367 ret = (void *) __get_free_pages(flags, order); 367 ret = (void *) __get_free_pages(flags, order);
368 kmemleak_alloc(ret, size, 1, flags); 368 kmemleak_alloc(ret, size, 1, flags);
369 return ret; 369 return ret;
370 } 370 }
371 371
372 #ifdef CONFIG_TRACING 372 #ifdef CONFIG_TRACING
373 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); 373 extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
374 #else 374 #else
375 static __always_inline void * 375 static __always_inline void *
376 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 376 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
377 { 377 {
378 return kmalloc_order(size, flags, order); 378 return kmalloc_order(size, flags, order);
379 } 379 }
380 #endif 380 #endif
381 381
382 static __always_inline void *kmalloc_large(size_t size, gfp_t flags) 382 static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
383 { 383 {
384 unsigned int order = get_order(size); 384 unsigned int order = get_order(size);
385 return kmalloc_order_trace(size, flags, order); 385 return kmalloc_order_trace(size, flags, order);
386 } 386 }
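For example (assuming 4 KiB pages, an assumption not stated in this hunk), a 100 KiB request that is too large for the kmalloc caches goes this way: get_order() yields 5, so 32 contiguous pages (128 KiB) are allocated. A hedged sketch, with a made-up function name:

static inline void *example_large_alloc(void)
{
	/* 100 KiB: get_order() == 5 with 4 KiB pages, i.e. a 128 KiB allocation. */
	return kmalloc_large(100 * 1024, GFP_KERNEL);
}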
387 387
388 /** 388 /**
389 * kmalloc - allocate memory 389 * kmalloc - allocate memory
390 * @size: how many bytes of memory are required. 390 * @size: how many bytes of memory are required.
391 * @flags: the type of memory to allocate. 391 * @flags: the type of memory to allocate.
392 * 392 *
393 * kmalloc is the normal method of allocating memory 393 * kmalloc is the normal method of allocating memory
394 * for objects smaller than page size in the kernel. 394 * for objects smaller than page size in the kernel.
395 * 395 *
396 * The @flags argument may be one of: 396 * The @flags argument may be one of:
397 * 397 *
398 * %GFP_USER - Allocate memory on behalf of user. May sleep. 398 * %GFP_USER - Allocate memory on behalf of user. May sleep.
399 * 399 *
400 * %GFP_KERNEL - Allocate normal kernel ram. May sleep. 400 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
401 * 401 *
402 * %GFP_ATOMIC - Allocation will not sleep. May use emergency pools. 402 * %GFP_ATOMIC - Allocation will not sleep. May use emergency pools.
403 * For example, use this inside interrupt handlers. 403 * For example, use this inside interrupt handlers.
404 * 404 *
405 * %GFP_HIGHUSER - Allocate pages from high memory. 405 * %GFP_HIGHUSER - Allocate pages from high memory.
406 * 406 *
407 * %GFP_NOIO - Do not do any I/O at all while trying to get memory. 407 * %GFP_NOIO - Do not do any I/O at all while trying to get memory.
408 * 408 *
409 * %GFP_NOFS - Do not make any fs calls while trying to get memory. 409 * %GFP_NOFS - Do not make any fs calls while trying to get memory.
410 * 410 *
411 * %GFP_NOWAIT - Allocation will not sleep. 411 * %GFP_NOWAIT - Allocation will not sleep.
412 * 412 *
413 * %__GFP_THISNODE - Allocate node-local memory only. 413 * %__GFP_THISNODE - Allocate node-local memory only.
414 * 414 *
415 * %GFP_DMA - Allocation suitable for DMA. 415 * %GFP_DMA - Allocation suitable for DMA.
416 * Should only be used for kmalloc() caches. Otherwise, use a 416 * Should only be used for kmalloc() caches. Otherwise, use a
417 * slab created with SLAB_CACHE_DMA. 417 * slab created with SLAB_CACHE_DMA.
418 * 418 *
419 * Also it is possible to set different flags by OR'ing 419 * Also it is possible to set different flags by OR'ing
420 * in one or more of the following additional @flags: 420 * in one or more of the following additional @flags:
421 * 421 *
422 * %__GFP_COLD - Request cache-cold pages instead of 422 * %__GFP_COLD - Request cache-cold pages instead of
423 * trying to return cache-warm pages. 423 * trying to return cache-warm pages.
424 * 424 *
425 * %__GFP_HIGH - This allocation has high priority and may use emergency pools. 425 * %__GFP_HIGH - This allocation has high priority and may use emergency pools.
426 * 426 *
427 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail 427 * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail
428 * (think twice before using). 428 * (think twice before using).
429 * 429 *
430 * %__GFP_NORETRY - If memory is not immediately available, 430 * %__GFP_NORETRY - If memory is not immediately available,
431 * then give up at once. 431 * then give up at once.
432 * 432 *
433 * %__GFP_NOWARN - If allocation fails, don't issue any warnings. 433 * %__GFP_NOWARN - If allocation fails, don't issue any warnings.
434 * 434 *
435 * %__GFP_REPEAT - If allocation fails initially, try once more before failing. 435 * %__GFP_REPEAT - If allocation fails initially, try once more before failing.
436 * 436 *
437 * There are other flags available as well, but these are not intended 437 * There are other flags available as well, but these are not intended
438 * for general use, and so are not documented here. For a full list of 438 * for general use, and so are not documented here. For a full list of
439 * potential flags, always refer to linux/gfp.h. 439 * potential flags, always refer to linux/gfp.h.
440 */ 440 */
441 static __always_inline void *kmalloc(size_t size, gfp_t flags) 441 static __always_inline void *kmalloc(size_t size, gfp_t flags)
442 { 442 {
443 if (__builtin_constant_p(size)) { 443 if (__builtin_constant_p(size)) {
444 if (size > KMALLOC_MAX_CACHE_SIZE) 444 if (size > KMALLOC_MAX_CACHE_SIZE)
445 return kmalloc_large(size, flags); 445 return kmalloc_large(size, flags);
446 #ifndef CONFIG_SLOB 446 #ifndef CONFIG_SLOB
447 if (!(flags & GFP_DMA)) { 447 if (!(flags & GFP_DMA)) {
448 int index = kmalloc_index(size); 448 int index = kmalloc_index(size);
449 449
450 if (!index) 450 if (!index)
451 return ZERO_SIZE_PTR; 451 return ZERO_SIZE_PTR;
452 452
453 return kmem_cache_alloc_trace(kmalloc_caches[index], 453 return kmem_cache_alloc_trace(kmalloc_caches[index],
454 flags, size); 454 flags, size);
455 } 455 }
456 #endif 456 #endif
457 } 457 }
458 return __kmalloc(size, flags); 458 return __kmalloc(size, flags);
459 } 459 }
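A minimal sketch of the two paths, with made-up names (struct foo, kmalloc_path_examples) on a non-SLOB build; error handling is trimmed and kfree(NULL) being a no-op is relied on.

struct foo {
	int a, b;
};

static void kmalloc_path_examples(size_t runtime_len, gfp_t flags)
{
	/*
	 * Constant size: __builtin_constant_p() is true, so the call folds to
	 * kmem_cache_alloc_trace(kmalloc_caches[index], ...) at compile time.
	 */
	struct foo *p = kmalloc(sizeof(*p), flags);

	/* Variable size: falls through to __kmalloc() and a run-time lookup. */
	void *buf = kmalloc(runtime_len, flags);

	kfree(buf);
	kfree(p);
}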
460 460
461 /* 461 /*
462 * Determine the size used for the nth kmalloc cache. 462 * Determine the size used for the nth kmalloc cache.
463 * Return the size, or 0 if a kmalloc cache for that 463 * Return the size, or 0 if a kmalloc cache for that
464 * size does not exist. 464 * size does not exist.
465 */ 465 */
466 static __always_inline int kmalloc_size(int n) 466 static __always_inline int kmalloc_size(int n)
467 { 467 {
468 #ifndef CONFIG_SLOB 468 #ifndef CONFIG_SLOB
469 if (n > 2) 469 if (n > 2)
470 return 1 << n; 470 return 1 << n;
471 471
472 if (n == 1 && KMALLOC_MIN_SIZE <= 32) 472 if (n == 1 && KMALLOC_MIN_SIZE <= 32)
473 return 96; 473 return 96;
474 474
475 if (n == 2 && KMALLOC_MIN_SIZE <= 64) 475 if (n == 2 && KMALLOC_MIN_SIZE <= 64)
476 return 192; 476 return 192;
477 #endif 477 #endif
478 return 0; 478 return 0;
479 } 479 }
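As a sanity check of the mapping (again assuming KMALLOC_MIN_SIZE of 8), kmalloc_size() and kmalloc_index() are inverses for sizes that actually have a cache; a small sketch with a made-up function name:

static inline void kmalloc_size_examples(void)
{
	BUG_ON(kmalloc_size(kmalloc_index(96))  != 96);		/* special 96-byte cache  */
	BUG_ON(kmalloc_size(kmalloc_index(192)) != 192);	/* special 192-byte cache */
	BUG_ON(kmalloc_size(kmalloc_index(100)) != 128);	/* rounded up to 128      */
}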
480 480
481 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) 481 static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
482 { 482 {
483 #ifndef CONFIG_SLOB 483 #ifndef CONFIG_SLOB
484 if (__builtin_constant_p(size) && 484 if (__builtin_constant_p(size) &&
485 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { 485 size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
486 int i = kmalloc_index(size); 486 int i = kmalloc_index(size);
487 487
488 if (!i) 488 if (!i)
489 return ZERO_SIZE_PTR; 489 return ZERO_SIZE_PTR;
490 490
491 return kmem_cache_alloc_node_trace(kmalloc_caches[i], 491 return kmem_cache_alloc_node_trace(kmalloc_caches[i],
492 flags, node, size); 492 flags, node, size);
493 } 493 }
494 #endif 494 #endif
495 return __kmalloc_node(size, flags, node); 495 return __kmalloc_node(size, flags, node);
496 } 496 }
497 497
498 /* 498 /*
499 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. 499 * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
500 * Intended for arches that get misalignment faults even for 64 bit integer 500 * Intended for arches that get misalignment faults even for 64 bit integer
501 * aligned buffers. 501 * aligned buffers.
502 */ 502 */
503 #ifndef ARCH_SLAB_MINALIGN 503 #ifndef ARCH_SLAB_MINALIGN
504 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 504 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
505 #endif 505 #endif
506 /* 506 /*
507 * This is the main placeholder for memcg-related information in kmem caches. 507 * This is the main placeholder for memcg-related information in kmem caches.
508 * struct kmem_cache will hold a pointer to it, so the memory cost while 508 * struct kmem_cache will hold a pointer to it, so the memory cost while
509 * disabled is 1 pointer. The runtime cost while enabled is higher than it 509 * disabled is 1 pointer. The runtime cost while enabled is higher than it
510 * would be if this were bundled into kmem_cache: we'd need an 510 * would be if this were bundled into kmem_cache: we'd need an
511 * extra pointer chase. But the trade-off clearly lies in favor of not 511 * extra pointer chase. But the trade-off clearly lies in favor of not
512 * penalizing non-users. 512 * penalizing non-users.
513 * 513 *
514 * Both the root cache and the child caches will have it. For the root cache, 514 * Both the root cache and the child caches will have it. For the root cache,
515 * this will hold a dynamically allocated array large enough to hold 515 * this will hold a dynamically allocated array large enough to hold
516 * information about the currently limited memcgs in the system. To allow the 516 * information about the currently limited memcgs in the system. To allow the
517 * array to be accessed without taking any locks, on relocation we free the old 517 * array to be accessed without taking any locks, on relocation we free the old
518 * version only after a grace period. 518 * version only after a grace period.
519 * 519 *
520 * Child caches will hold extra metadata needed for their operation. Fields are: 520 * Child caches will hold extra metadata needed for their operation. Fields are:
521 * 521 *
522 * @memcg: pointer to the memcg this cache belongs to 522 * @memcg: pointer to the memcg this cache belongs to
523 * @list: list_head for the list of all caches in this memcg 523 * @list: list_head for the list of all caches in this memcg
524 * @root_cache: pointer to the global, root cache, this cache was derived from 524 * @root_cache: pointer to the global, root cache, this cache was derived from
525 * @dead: set to true after the memcg dies; the cache may still be around. 525 * @dead: set to true after the memcg dies; the cache may still be around.
526 * @nr_pages: number of pages that belong to this cache. 526 * @nr_pages: number of pages that belong to this cache.
527 * @destroy: worker to be called whenever we are ready, or believe we may be 527 * @destroy: worker to be called whenever we are ready, or believe we may be
528 * ready, to destroy this cache. 528 * ready, to destroy this cache.
529 */ 529 */
530 struct memcg_cache_params { 530 struct memcg_cache_params {
531 bool is_root_cache; 531 bool is_root_cache;
532 union { 532 union {
533 struct { 533 struct {
534 struct rcu_head rcu_head; 534 struct rcu_head rcu_head;
535 struct kmem_cache *memcg_caches[0]; 535 struct kmem_cache *memcg_caches[0];
536 }; 536 };
537 struct { 537 struct {
538 struct mem_cgroup *memcg; 538 struct mem_cgroup *memcg;
539 struct list_head list; 539 struct list_head list;
540 struct kmem_cache *root_cache; 540 struct kmem_cache *root_cache;
541 bool dead; 541 bool dead;
542 atomic_t nr_pages; 542 atomic_t nr_pages;
543 struct work_struct destroy; 543 struct work_struct destroy;
544 }; 544 };
545 }; 545 };
546 }; 546 };
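To illustrate the layout described above: a root cache reaches the copy used by a given memcg through the memcg_caches[] array, indexed by that memcg's kmemcg_id. This is only a sketch under the assumption of CONFIG_MEMCG_KMEM (where SLAB's and SLUB's struct kmem_cache carry a memcg_params pointer); the helper name is made up, and the real accessors in mm/slab.h also dereference the array under RCU, per the grace-period rule above.

static struct kmem_cache *example_child_cache(struct kmem_cache *root,
					      int kmemcg_id)
{
	if (!root->memcg_params || !root->memcg_params->is_root_cache)
		return root;	/* no memcg copies, or already a per-memcg cache */

	/* NULL until the memcg cache creation path has filled in this slot. */
	return root->memcg_params->memcg_caches[kmemcg_id];
}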
547 547
548 int memcg_update_all_caches(int num_memcgs); 548 int memcg_update_all_caches(int num_memcgs);
549 549
550 struct seq_file; 550 struct seq_file;
551 int cache_show(struct kmem_cache *s, struct seq_file *m); 551 int cache_show(struct kmem_cache *s, struct seq_file *m);
552 void print_slabinfo_header(struct seq_file *m); 552 void print_slabinfo_header(struct seq_file *m);
553 553
554 /** 554 /**
555 * kmalloc_array - allocate memory for an array. 555 * kmalloc_array - allocate memory for an array.
556 * @n: number of elements. 556 * @n: number of elements.
557 * @size: element size. 557 * @size: element size.
558 * @flags: the type of memory to allocate (see kmalloc). 558 * @flags: the type of memory to allocate (see kmalloc).
559 */ 559 */
560 static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) 560 static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
561 { 561 {
562 if (size != 0 && n > SIZE_MAX / size) 562 if (size != 0 && n > SIZE_MAX / size)
563 return NULL; 563 return NULL;
564 return __kmalloc(n * size, flags); 564 return __kmalloc(n * size, flags);
565 } 565 }
566 566
567 /** 567 /**
568 * kcalloc - allocate memory for an array. The memory is set to zero. 568 * kcalloc - allocate memory for an array. The memory is set to zero.
569 * @n: number of elements. 569 * @n: number of elements.
570 * @size: element size. 570 * @size: element size.
571 * @flags: the type of memory to allocate (see kmalloc). 571 * @flags: the type of memory to allocate (see kmalloc).
572 */ 572 */
573 static inline void *kcalloc(size_t n, size_t size, gfp_t flags) 573 static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
574 { 574 {
575 return kmalloc_array(n, size, flags | __GFP_ZERO); 575 return kmalloc_array(n, size, flags | __GFP_ZERO);
576 } 576 }
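A short usage sketch (names are made up): allocating an array whose length comes from the caller, relying on kcalloc()/kmalloc_array() to return NULL instead of silently wrapping when n * size would overflow.

struct item {
	u64 key;
	u64 val;
};

static struct item *alloc_items(size_t n, gfp_t flags)
{
	/* NULL on allocation failure *and* on n * sizeof(struct item) overflow. */
	return kcalloc(n, sizeof(struct item), flags);
}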
577 577
578 /* 578 /*
579 * kmalloc_track_caller is a special version of kmalloc that records the 579 * kmalloc_track_caller is a special version of kmalloc that records the
580 * calling function of the routine calling it for slab leak tracking instead 580 * calling function of the routine calling it for slab leak tracking instead
581 * of just the calling function (confusing, eh?). 581 * of just the calling function (confusing, eh?).
582 * It's useful when the call to kmalloc comes from a widely-used standard 582 * It's useful when the call to kmalloc comes from a widely-used standard
583 * allocator where we care about the real place the memory allocation 583 * allocator where we care about the real place the memory allocation
584 * request comes from. 584 * request comes from.
585 */ 585 */
586 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ 586 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
587 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ 587 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
588 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) 588 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
589 extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); 589 extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long);
590 #define kmalloc_track_caller(size, flags) \ 590 #define kmalloc_track_caller(size, flags) \
591 __kmalloc_track_caller(size, flags, _RET_IP_) 591 __kmalloc_track_caller(size, flags, _RET_IP_)
592 #else 592 #else
593 #define kmalloc_track_caller(size, flags) \ 593 #define kmalloc_track_caller(size, flags) \
594 __kmalloc(size, flags) 594 __kmalloc(size, flags)
595 #endif /* DEBUG_SLAB */ 595 #endif /* DEBUG_SLAB */
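A sketch of the intended use, with a made-up helper modeled on kmemdup(): because _RET_IP_ is taken inside the helper, the allocation is attributed to the helper's caller, which is the call site a leak report actually needs to show.

static void *example_memdup(const void *src, size_t len, gfp_t flags)
{
	void *p = kmalloc_track_caller(len, flags);

	if (p)
		memcpy(p, src, len);
	return p;
}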
596 596
597 #ifdef CONFIG_NUMA 597 #ifdef CONFIG_NUMA
598 /* 598 /*
599 * kmalloc_node_track_caller is a special version of kmalloc_node that 599 * kmalloc_node_track_caller is a special version of kmalloc_node that
600 * records the calling function of the routine calling it for slab leak 600 * records the calling function of the routine calling it for slab leak
601 * tracking instead of just the calling function (confusing, eh?). 601 * tracking instead of just the calling function (confusing, eh?).
602 * It's useful when the call to kmalloc_node comes from a widely-used 602 * It's useful when the call to kmalloc_node comes from a widely-used
603 * standard allocator where we care about the real place the memory 603 * standard allocator where we care about the real place the memory
604 * allocation request comes from. 604 * allocation request comes from.
605 */ 605 */
606 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ 606 #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \
607 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ 607 (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \
608 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) 608 (defined(CONFIG_SLOB) && defined(CONFIG_TRACING))
609 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); 609 extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long);
610 #define kmalloc_node_track_caller(size, flags, node) \ 610 #define kmalloc_node_track_caller(size, flags, node) \
611 __kmalloc_node_track_caller(size, flags, node, \ 611 __kmalloc_node_track_caller(size, flags, node, \
612 _RET_IP_) 612 _RET_IP_)
613 #else 613 #else
614 #define kmalloc_node_track_caller(size, flags, node) \ 614 #define kmalloc_node_track_caller(size, flags, node) \
615 __kmalloc_node(size, flags, node) 615 __kmalloc_node(size, flags, node)
616 #endif 616 #endif
617 617
618 #else /* CONFIG_NUMA */ 618 #else /* CONFIG_NUMA */
619 619
620 #define kmalloc_node_track_caller(size, flags, node) \ 620 #define kmalloc_node_track_caller(size, flags, node) \
621 kmalloc_track_caller(size, flags) 621 kmalloc_track_caller(size, flags)
622 622
623 #endif /* CONFIG_NUMA */ 623 #endif /* CONFIG_NUMA */
624 624
625 /* 625 /*
626 * Shortcuts 626 * Shortcuts
627 */ 627 */
628 static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) 628 static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
629 { 629 {
630 return kmem_cache_alloc(k, flags | __GFP_ZERO); 630 return kmem_cache_alloc(k, flags | __GFP_ZERO);
631 } 631 }
632 632
633 /** 633 /**
634 * kzalloc - allocate memory. The memory is set to zero. 634 * kzalloc - allocate memory. The memory is set to zero.
635 * @size: how many bytes of memory are required. 635 * @size: how many bytes of memory are required.
636 * @flags: the type of memory to allocate (see kmalloc). 636 * @flags: the type of memory to allocate (see kmalloc).
637 */ 637 */
638 static inline void *kzalloc(size_t size, gfp_t flags) 638 static inline void *kzalloc(size_t size, gfp_t flags)
639 { 639 {
640 return kmalloc(size, flags | __GFP_ZERO); 640 return kmalloc(size, flags | __GFP_ZERO);
641 } 641 }
642 642
643 /** 643 /**
644 * kzalloc_node - allocate zeroed memory from a particular memory node. 644 * kzalloc_node - allocate zeroed memory from a particular memory node.
645 * @size: how many bytes of memory are required. 645 * @size: how many bytes of memory are required.
646 * @flags: the type of memory to allocate (see kmalloc). 646 * @flags: the type of memory to allocate (see kmalloc).
647 * @node: memory node from which to allocate 647 * @node: memory node from which to allocate
648 */ 648 */
649 static inline void *kzalloc_node(size_t size, gfp_t flags, int node) 649 static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
650 { 650 {
651 return kmalloc_node(size, flags | __GFP_ZERO, node); 651 return kmalloc_node(size, flags | __GFP_ZERO, node);
652 } 652 }
653 653
654 /* 654 /*
655 * Determine the size of a slab object 655 * Determine the size of a slab object
656 */ 656 */
657 static inline unsigned int kmem_cache_size(struct kmem_cache *s) 657 static inline unsigned int kmem_cache_size(struct kmem_cache *s)
658 { 658 {
659 return s->object_size; 659 return s->object_size;
660 } 660 }
661 661
662 void __init kmem_cache_init_late(void); 662 void __init kmem_cache_init_late(void);
663 663
664 #endif /* _LINUX_SLAB_H */ 664 #endif /* _LINUX_SLAB_H */
665 665
1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * Kernel Memory Controller 13 * Kernel Memory Controller
14 * Copyright (C) 2012 Parallels Inc. and Google Inc. 14 * Copyright (C) 2012 Parallels Inc. and Google Inc.
15 * Authors: Glauber Costa and Suleiman Souhlal 15 * Authors: Glauber Costa and Suleiman Souhlal
16 * 16 *
17 * This program is free software; you can redistribute it and/or modify 17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by 18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or 19 * the Free Software Foundation; either version 2 of the License, or
20 * (at your option) any later version. 20 * (at your option) any later version.
21 * 21 *
22 * This program is distributed in the hope that it will be useful, 22 * This program is distributed in the hope that it will be useful,
23 * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 * GNU General Public License for more details. 25 * GNU General Public License for more details.
26 */ 26 */
27 27
28 #include <linux/res_counter.h> 28 #include <linux/res_counter.h>
29 #include <linux/memcontrol.h> 29 #include <linux/memcontrol.h>
30 #include <linux/cgroup.h> 30 #include <linux/cgroup.h>
31 #include <linux/mm.h> 31 #include <linux/mm.h>
32 #include <linux/hugetlb.h> 32 #include <linux/hugetlb.h>
33 #include <linux/pagemap.h> 33 #include <linux/pagemap.h>
34 #include <linux/smp.h> 34 #include <linux/smp.h>
35 #include <linux/page-flags.h> 35 #include <linux/page-flags.h>
36 #include <linux/backing-dev.h> 36 #include <linux/backing-dev.h>
37 #include <linux/bit_spinlock.h> 37 #include <linux/bit_spinlock.h>
38 #include <linux/rcupdate.h> 38 #include <linux/rcupdate.h>
39 #include <linux/limits.h> 39 #include <linux/limits.h>
40 #include <linux/export.h> 40 #include <linux/export.h>
41 #include <linux/mutex.h> 41 #include <linux/mutex.h>
42 #include <linux/rbtree.h> 42 #include <linux/rbtree.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/swap.h> 44 #include <linux/swap.h>
45 #include <linux/swapops.h> 45 #include <linux/swapops.h>
46 #include <linux/spinlock.h> 46 #include <linux/spinlock.h>
47 #include <linux/eventfd.h> 47 #include <linux/eventfd.h>
48 #include <linux/poll.h> 48 #include <linux/poll.h>
49 #include <linux/sort.h> 49 #include <linux/sort.h>
50 #include <linux/fs.h> 50 #include <linux/fs.h>
51 #include <linux/seq_file.h> 51 #include <linux/seq_file.h>
52 #include <linux/vmpressure.h> 52 #include <linux/vmpressure.h>
53 #include <linux/mm_inline.h> 53 #include <linux/mm_inline.h>
54 #include <linux/page_cgroup.h> 54 #include <linux/page_cgroup.h>
55 #include <linux/cpu.h> 55 #include <linux/cpu.h>
56 #include <linux/oom.h> 56 #include <linux/oom.h>
57 #include <linux/lockdep.h> 57 #include <linux/lockdep.h>
58 #include <linux/file.h> 58 #include <linux/file.h>
59 #include "internal.h" 59 #include "internal.h"
60 #include <net/sock.h> 60 #include <net/sock.h>
61 #include <net/ip.h> 61 #include <net/ip.h>
62 #include <net/tcp_memcontrol.h> 62 #include <net/tcp_memcontrol.h>
63 #include "slab.h" 63 #include "slab.h"
64 64
65 #include <asm/uaccess.h> 65 #include <asm/uaccess.h>
66 66
67 #include <trace/events/vmscan.h> 67 #include <trace/events/vmscan.h>
68 68
69 struct cgroup_subsys memory_cgrp_subsys __read_mostly; 69 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
70 EXPORT_SYMBOL(memory_cgrp_subsys); 70 EXPORT_SYMBOL(memory_cgrp_subsys);
71 71
72 #define MEM_CGROUP_RECLAIM_RETRIES 5 72 #define MEM_CGROUP_RECLAIM_RETRIES 5
73 static struct mem_cgroup *root_mem_cgroup __read_mostly; 73 static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75 #ifdef CONFIG_MEMCG_SWAP 75 #ifdef CONFIG_MEMCG_SWAP
76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 76 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77 int do_swap_account __read_mostly; 77 int do_swap_account __read_mostly;
78 78
79 /* to remember the boot option */ 79 /* to remember the boot option */
80 #ifdef CONFIG_MEMCG_SWAP_ENABLED 80 #ifdef CONFIG_MEMCG_SWAP_ENABLED
81 static int really_do_swap_account __initdata = 1; 81 static int really_do_swap_account __initdata = 1;
82 #else 82 #else
83 static int really_do_swap_account __initdata = 0; 83 static int really_do_swap_account __initdata = 0;
84 #endif 84 #endif
85 85
86 #else 86 #else
87 #define do_swap_account 0 87 #define do_swap_account 0
88 #endif 88 #endif
89 89
90 90
91 static const char * const mem_cgroup_stat_names[] = { 91 static const char * const mem_cgroup_stat_names[] = {
92 "cache", 92 "cache",
93 "rss", 93 "rss",
94 "rss_huge", 94 "rss_huge",
95 "mapped_file", 95 "mapped_file",
96 "writeback", 96 "writeback",
97 "swap", 97 "swap",
98 }; 98 };
99 99
100 enum mem_cgroup_events_index { 100 enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS, 105 MEM_CGROUP_EVENTS_NSTATS,
106 }; 106 };
107 107
108 static const char * const mem_cgroup_events_names[] = { 108 static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 109 "pgpgin",
110 "pgpgout", 110 "pgpgout",
111 "pgfault", 111 "pgfault",
112 "pgmajfault", 112 "pgmajfault",
113 }; 113 };
114 114
115 static const char * const mem_cgroup_lru_names[] = { 115 static const char * const mem_cgroup_lru_names[] = {
116 "inactive_anon", 116 "inactive_anon",
117 "active_anon", 117 "active_anon",
118 "inactive_file", 118 "inactive_file",
119 "active_file", 119 "active_file",
120 "unevictable", 120 "unevictable",
121 }; 121 };
122 122
123 /* 123 /*
124 * Per memcg event counter is incremented at every pagein/pageout. With THP, 124 * Per memcg event counter is incremented at every pagein/pageout. With THP,
125 * it will be incremented by the number of pages. This counter is used to 125 * it will be incremented by the number of pages. This counter is used to
126 * trigger some periodic events. This is straightforward and better 126 * trigger some periodic events. This is straightforward and better
127 * than using jiffies etc. to handle periodic memcg events. 127 * than using jiffies etc. to handle periodic memcg events.
128 */ 128 */
129 enum mem_cgroup_events_target { 129 enum mem_cgroup_events_target {
130 MEM_CGROUP_TARGET_THRESH, 130 MEM_CGROUP_TARGET_THRESH,
131 MEM_CGROUP_TARGET_SOFTLIMIT, 131 MEM_CGROUP_TARGET_SOFTLIMIT,
132 MEM_CGROUP_TARGET_NUMAINFO, 132 MEM_CGROUP_TARGET_NUMAINFO,
133 MEM_CGROUP_NTARGETS, 133 MEM_CGROUP_NTARGETS,
134 }; 134 };
135 #define THRESHOLDS_EVENTS_TARGET 128 135 #define THRESHOLDS_EVENTS_TARGET 128
136 #define SOFTLIMIT_EVENTS_TARGET 1024 136 #define SOFTLIMIT_EVENTS_TARGET 1024
137 #define NUMAINFO_EVENTS_TARGET 1024 137 #define NUMAINFO_EVENTS_TARGET 1024
138 138
139 struct mem_cgroup_stat_cpu { 139 struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 140 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
142 unsigned long nr_page_events; 142 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 143 unsigned long targets[MEM_CGROUP_NTARGETS];
144 }; 144 };
145 145
146 struct mem_cgroup_reclaim_iter { 146 struct mem_cgroup_reclaim_iter {
147 /* 147 /*
148 * last scanned hierarchy member. Valid only if last_dead_count 148 * last scanned hierarchy member. Valid only if last_dead_count
149 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
150 */ 150 */
151 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
152 int last_dead_count; 152 int last_dead_count;
153 153
154 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
155 unsigned int generation; 155 unsigned int generation;
156 }; 156 };
157 157
158 /* 158 /*
159 * per-zone information in memory controller. 159 * per-zone information in memory controller.
160 */ 160 */
161 struct mem_cgroup_per_zone { 161 struct mem_cgroup_per_zone {
162 struct lruvec lruvec; 162 struct lruvec lruvec;
163 unsigned long lru_size[NR_LRU_LISTS]; 163 unsigned long lru_size[NR_LRU_LISTS];
164 164
165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 165 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
166 166
167 struct rb_node tree_node; /* RB tree node */ 167 struct rb_node tree_node; /* RB tree node */
168 unsigned long long usage_in_excess;/* Set to the value by which */ 168 unsigned long long usage_in_excess;/* Set to the value by which */
169 /* the soft limit is exceeded*/ 169 /* the soft limit is exceeded*/
170 bool on_tree; 170 bool on_tree;
171 struct mem_cgroup *memcg; /* Back pointer, we cannot */ 171 struct mem_cgroup *memcg; /* Back pointer, we cannot */
172 /* use container_of */ 172 /* use container_of */
173 }; 173 };
174 174
175 struct mem_cgroup_per_node { 175 struct mem_cgroup_per_node {
176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 176 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
177 }; 177 };
178 178
179 /* 179 /*
180 * Cgroups above their limits are maintained in a RB-Tree, independent of 180 * Cgroups above their limits are maintained in a RB-Tree, independent of
181 * their hierarchy representation 181 * their hierarchy representation
182 */ 182 */
183 183
184 struct mem_cgroup_tree_per_zone { 184 struct mem_cgroup_tree_per_zone {
185 struct rb_root rb_root; 185 struct rb_root rb_root;
186 spinlock_t lock; 186 spinlock_t lock;
187 }; 187 };
188 188
189 struct mem_cgroup_tree_per_node { 189 struct mem_cgroup_tree_per_node {
190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 190 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
191 }; 191 };
192 192
193 struct mem_cgroup_tree { 193 struct mem_cgroup_tree {
194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 194 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
195 }; 195 };
196 196
197 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 197 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
198 198
199 struct mem_cgroup_threshold { 199 struct mem_cgroup_threshold {
200 struct eventfd_ctx *eventfd; 200 struct eventfd_ctx *eventfd;
201 u64 threshold; 201 u64 threshold;
202 }; 202 };
203 203
204 /* For threshold */ 204 /* For threshold */
205 struct mem_cgroup_threshold_ary { 205 struct mem_cgroup_threshold_ary {
206 /* An array index points to threshold just below or equal to usage. */ 206 /* An array index points to threshold just below or equal to usage. */
207 int current_threshold; 207 int current_threshold;
208 /* Size of entries[] */ 208 /* Size of entries[] */
209 unsigned int size; 209 unsigned int size;
210 /* Array of thresholds */ 210 /* Array of thresholds */
211 struct mem_cgroup_threshold entries[0]; 211 struct mem_cgroup_threshold entries[0];
212 }; 212 };
213 213
214 struct mem_cgroup_thresholds { 214 struct mem_cgroup_thresholds {
215 /* Primary thresholds array */ 215 /* Primary thresholds array */
216 struct mem_cgroup_threshold_ary *primary; 216 struct mem_cgroup_threshold_ary *primary;
217 /* 217 /*
218 * Spare threshold array. 218 * Spare threshold array.
219 * This is needed to make mem_cgroup_unregister_event() "never fail". 219 * This is needed to make mem_cgroup_unregister_event() "never fail".
220 * It must be able to store at least primary->size - 1 entries. 220 * It must be able to store at least primary->size - 1 entries.
221 */ 221 */
222 struct mem_cgroup_threshold_ary *spare; 222 struct mem_cgroup_threshold_ary *spare;
223 }; 223 };
224 224
225 /* for OOM */ 225 /* for OOM */
226 struct mem_cgroup_eventfd_list { 226 struct mem_cgroup_eventfd_list {
227 struct list_head list; 227 struct list_head list;
228 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
229 }; 229 };
230 230
231 /* 231 /*
232 * cgroup_event represents events which userspace wants to receive. 232 * cgroup_event represents events which userspace wants to receive.
233 */ 233 */
234 struct mem_cgroup_event { 234 struct mem_cgroup_event {
235 /* 235 /*
236 * memcg which the event belongs to. 236 * memcg which the event belongs to.
237 */ 237 */
238 struct mem_cgroup *memcg; 238 struct mem_cgroup *memcg;
239 /* 239 /*
240 * eventfd to signal userspace about the event. 240 * eventfd to signal userspace about the event.
241 */ 241 */
242 struct eventfd_ctx *eventfd; 242 struct eventfd_ctx *eventfd;
243 /* 243 /*
244 * Each of these is stored in a list by the cgroup. 244 * Each of these is stored in a list by the cgroup.
245 */ 245 */
246 struct list_head list; 246 struct list_head list;
247 /* 247 /*
248 * register_event() callback will be used to add new userspace 248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal() 249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace. 250 * on eventfd to send notification to userspace.
251 */ 251 */
252 int (*register_event)(struct mem_cgroup *memcg, 252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args); 253 struct eventfd_ctx *eventfd, const char *args);
254 /* 254 /*
255 * unregister_event() callback will be called when userspace closes 255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or on cgroup removal. This callback must be set 256 * the eventfd or on cgroup removal. This callback must be set
257 * if you want to provide notification functionality. 257 * if you want to provide notification functionality.
258 */ 258 */
259 void (*unregister_event)(struct mem_cgroup *memcg, 259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd); 260 struct eventfd_ctx *eventfd);
261 /* 261 /*
262 * All fields below are needed to unregister the event when 262 * All fields below are needed to unregister the event when
263 * userspace closes the eventfd. 263 * userspace closes the eventfd.
264 */ 264 */
265 poll_table pt; 265 poll_table pt;
266 wait_queue_head_t *wqh; 266 wait_queue_head_t *wqh;
267 wait_queue_t wait; 267 wait_queue_t wait;
268 struct work_struct remove; 268 struct work_struct remove;
269 }; 269 };
270 270
271 static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
273 273
274 /* 274 /*
275 * The memory controller data structure. The memory controller controls both 275 * The memory controller data structure. The memory controller controls both
276 * page cache and RSS per cgroup. We would eventually like to provide 276 * page cache and RSS per cgroup. We would eventually like to provide
277 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 277 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
278 * to help the administrator determine what knobs to tune. 278 * to help the administrator determine what knobs to tune.
279 * 279 *
280 * TODO: Add a water mark for the memory controller. Reclaim will begin when 280 * TODO: Add a water mark for the memory controller. Reclaim will begin when
281 * we hit the water mark. Maybe even add a low water mark, such that 281 * we hit the water mark. Maybe even add a low water mark, such that
282 * no reclaim occurs from a cgroup at its low water mark; this is 282 * no reclaim occurs from a cgroup at its low water mark; this is
283 * a feature that will be implemented much later in the future. 283 * a feature that will be implemented much later in the future.
284 */ 284 */
285 struct mem_cgroup { 285 struct mem_cgroup {
286 struct cgroup_subsys_state css; 286 struct cgroup_subsys_state css;
287 /* 287 /*
288 * the counter to account for memory usage 288 * the counter to account for memory usage
289 */ 289 */
290 struct res_counter res; 290 struct res_counter res;
291 291
292 /* vmpressure notifications */ 292 /* vmpressure notifications */
293 struct vmpressure vmpressure; 293 struct vmpressure vmpressure;
294 294
295 /* 295 /*
296 * the counter to account for mem+swap usage. 296 * the counter to account for mem+swap usage.
297 */ 297 */
298 struct res_counter memsw; 298 struct res_counter memsw;
299 299
300 /* 300 /*
301 * the counter to account for kernel memory usage. 301 * the counter to account for kernel memory usage.
302 */ 302 */
303 struct res_counter kmem; 303 struct res_counter kmem;
304 /* 304 /*
305 * Should the accounting and control be hierarchical, per subtree? 305 * Should the accounting and control be hierarchical, per subtree?
306 */ 306 */
307 bool use_hierarchy; 307 bool use_hierarchy;
308 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ 308 unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
309 309
310 bool oom_lock; 310 bool oom_lock;
311 atomic_t under_oom; 311 atomic_t under_oom;
312 atomic_t oom_wakeups; 312 atomic_t oom_wakeups;
313 313
314 int swappiness; 314 int swappiness;
315 /* OOM-Killer disable */ 315 /* OOM-Killer disable */
316 int oom_kill_disable; 316 int oom_kill_disable;
317 317
318 /* set when res.limit == memsw.limit */ 318 /* set when res.limit == memsw.limit */
319 bool memsw_is_minimum; 319 bool memsw_is_minimum;
320 320
321 /* protect arrays of thresholds */ 321 /* protect arrays of thresholds */
322 struct mutex thresholds_lock; 322 struct mutex thresholds_lock;
323 323
324 /* thresholds for memory usage. RCU-protected */ 324 /* thresholds for memory usage. RCU-protected */
325 struct mem_cgroup_thresholds thresholds; 325 struct mem_cgroup_thresholds thresholds;
326 326
327 /* thresholds for mem+swap usage. RCU-protected */ 327 /* thresholds for mem+swap usage. RCU-protected */
328 struct mem_cgroup_thresholds memsw_thresholds; 328 struct mem_cgroup_thresholds memsw_thresholds;
329 329
330 /* For oom notifier event fd */ 330 /* For oom notifier event fd */
331 struct list_head oom_notify; 331 struct list_head oom_notify;
332 332
333 /* 333 /*
334 * Should we move charges of a task when a task is moved into this 334 * Should we move charges of a task when a task is moved into this
335 * mem_cgroup ? And what type of charges should we move ? 335 * mem_cgroup ? And what type of charges should we move ?
336 */ 336 */
337 unsigned long move_charge_at_immigrate; 337 unsigned long move_charge_at_immigrate;
338 /* 338 /*
339 * set > 0 if pages under this cgroup are moving to other cgroup. 339 * set > 0 if pages under this cgroup are moving to other cgroup.
340 */ 340 */
341 atomic_t moving_account; 341 atomic_t moving_account;
342 /* taken only while moving_account > 0 */ 342 /* taken only while moving_account > 0 */
343 spinlock_t move_lock; 343 spinlock_t move_lock;
344 /* 344 /*
345 * percpu counter. 345 * percpu counter.
346 */ 346 */
347 struct mem_cgroup_stat_cpu __percpu *stat; 347 struct mem_cgroup_stat_cpu __percpu *stat;
348 /* 348 /*
349 * used when a cpu is offlined or other synchronizations 349 * used when a cpu is offlined or other synchronizations
350 * See mem_cgroup_read_stat(). 350 * See mem_cgroup_read_stat().
351 */ 351 */
352 struct mem_cgroup_stat_cpu nocpu_base; 352 struct mem_cgroup_stat_cpu nocpu_base;
353 spinlock_t pcp_counter_lock; 353 spinlock_t pcp_counter_lock;
354 354
355 atomic_t dead_count; 355 atomic_t dead_count;
356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) 356 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
357 struct cg_proto tcp_mem; 357 struct cg_proto tcp_mem;
358 #endif 358 #endif
359 #if defined(CONFIG_MEMCG_KMEM) 359 #if defined(CONFIG_MEMCG_KMEM)
360 /* analogous to slab_common's slab_caches list. per-memcg */ 360 /* analogous to slab_common's slab_caches list. per-memcg */
361 struct list_head memcg_slab_caches; 361 struct list_head memcg_slab_caches;
362 /* Not a spinlock, we can take a lot of time walking the list */ 362 /* Not a spinlock, we can take a lot of time walking the list */
363 struct mutex slab_caches_mutex; 363 struct mutex slab_caches_mutex;
364 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 364 /* Index in the kmem_cache->memcg_params->memcg_caches array */
365 int kmemcg_id; 365 int kmemcg_id;
366 #endif 366 #endif
367 367
368 int last_scanned_node; 368 int last_scanned_node;
369 #if MAX_NUMNODES > 1 369 #if MAX_NUMNODES > 1
370 nodemask_t scan_nodes; 370 nodemask_t scan_nodes;
371 atomic_t numainfo_events; 371 atomic_t numainfo_events;
372 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
373 #endif 373 #endif
374 374
375 /* List of events which userspace want to receive */ 375 /* List of events which userspace want to receive */
376 struct list_head event_list; 376 struct list_head event_list;
377 spinlock_t event_list_lock; 377 spinlock_t event_list_lock;
378 378
379 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
380 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
381 }; 381 };
382 382
383 /* internal only representation about the status of kmem accounting. */ 383 /* internal only representation about the status of kmem accounting. */
384 enum { 384 enum {
385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
387 }; 387 };
388 388
389 #ifdef CONFIG_MEMCG_KMEM 389 #ifdef CONFIG_MEMCG_KMEM
390 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390 static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
391 { 391 {
392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 392 set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
393 } 393 }
394 394
395 static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 395 static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
396 { 396 {
397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
398 } 398 }
399 399
400 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
401 { 401 {
402 /* 402 /*
403 * Our caller must use css_get() first, because memcg_uncharge_kmem() 403 * Our caller must use css_get() first, because memcg_uncharge_kmem()
404 * will call css_put() if it sees the memcg is dead. 404 * will call css_put() if it sees the memcg is dead.
405 */ 405 */
406 smp_wmb(); 406 smp_wmb();
407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) 407 if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); 408 set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
409 } 409 }
410 410
411 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) 411 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
412 { 412 {
413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, 413 return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
414 &memcg->kmem_account_flags); 414 &memcg->kmem_account_flags);
415 } 415 }
416 #endif 416 #endif
417 417
418 /* Stuff for moving charges at task migration. */ 418 /* Stuff for moving charges at task migration. */
419 /* 419 /*
420 * Types of charges to be moved. "move_charge_at_immigrate" and 420 * Types of charges to be moved. "move_charge_at_immigrate" and
421 * "immigrate_flags" are treated as a left-shifted bitmap of these types. 421 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
422 */ 422 */
423 enum move_type { 423 enum move_type {
424 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 424 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
425 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 425 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
426 NR_MOVE_TYPE, 426 NR_MOVE_TYPE,
427 }; 427 };
428 428
429 /* "mc" and its members are protected by cgroup_mutex */ 429 /* "mc" and its members are protected by cgroup_mutex */
430 static struct move_charge_struct { 430 static struct move_charge_struct {
431 spinlock_t lock; /* for from, to */ 431 spinlock_t lock; /* for from, to */
432 struct mem_cgroup *from; 432 struct mem_cgroup *from;
433 struct mem_cgroup *to; 433 struct mem_cgroup *to;
434 unsigned long immigrate_flags; 434 unsigned long immigrate_flags;
435 unsigned long precharge; 435 unsigned long precharge;
436 unsigned long moved_charge; 436 unsigned long moved_charge;
437 unsigned long moved_swap; 437 unsigned long moved_swap;
438 struct task_struct *moving_task; /* a task moving charges */ 438 struct task_struct *moving_task; /* a task moving charges */
439 wait_queue_head_t waitq; /* a waitq for other context */ 439 wait_queue_head_t waitq; /* a waitq for other context */
440 } mc = { 440 } mc = {
441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 441 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 442 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
443 }; 443 };
444 444
445 static bool move_anon(void) 445 static bool move_anon(void)
446 { 446 {
447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags); 447 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
448 } 448 }
449 449
450 static bool move_file(void) 450 static bool move_file(void)
451 { 451 {
452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags); 452 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
453 } 453 }
454 454
455 /* 455 /*
456 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 456 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
457 * limit reclaim to prevent infinite loops, if they ever occur. 457 * limit reclaim to prevent infinite loops, if they ever occur.
458 */ 458 */
459 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 459 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
460 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 460 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
461 461
462 enum charge_type { 462 enum charge_type {
463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 463 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
464 MEM_CGROUP_CHARGE_TYPE_ANON, 464 MEM_CGROUP_CHARGE_TYPE_ANON,
465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 465 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
466 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 466 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
467 NR_CHARGE_TYPE, 467 NR_CHARGE_TYPE,
468 }; 468 };
469 469
470 /* for encoding cft->private value on file */ 470 /* for encoding cft->private value on file */
471 enum res_type { 471 enum res_type {
472 _MEM, 472 _MEM,
473 _MEMSWAP, 473 _MEMSWAP,
474 _OOM_TYPE, 474 _OOM_TYPE,
475 _KMEM, 475 _KMEM,
476 }; 476 };
477 477
478 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 478 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
479 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 479 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
480 #define MEMFILE_ATTR(val) ((val) & 0xffff) 480 #define MEMFILE_ATTR(val) ((val) & 0xffff)
481 /* Used for OOM notifier */ 481 /* Used for OOM notifier */
482 #define OOM_CONTROL (0) 482 #define OOM_CONTROL (0)
483 483
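The MEMFILE_* macros above pack a resource type (_MEM, _MEMSWAP, _OOM_TYPE, _KMEM) and a per-file attribute into the single integer stored in cft->private: the type lives in the upper 16 bits, the attribute in the lower 16. A minimal standalone sketch of the same packing follows; the DEMO_* names and the attribute value 7 are illustrative, not identifiers from this patch.

#include <assert.h>
#include <stdio.h>

/* Same bit layout as MEMFILE_PRIVATE/MEMFILE_TYPE/MEMFILE_ATTR above. */
#define DEMO_PRIVATE(x, val)	((x) << 16 | (val))
#define DEMO_TYPE(val)		((val) >> 16 & 0xffff)
#define DEMO_ATTR(val)		((val) & 0xffff)

enum demo_res_type { DEMO_MEM, DEMO_MEMSWAP, DEMO_OOM_TYPE, DEMO_KMEM };

int main(void)
{
	int priv = DEMO_PRIVATE(DEMO_KMEM, 7);	/* 7 is an arbitrary attribute */

	assert(DEMO_TYPE(priv) == DEMO_KMEM);	/* upper 16 bits: the type */
	assert(DEMO_ATTR(priv) == 7);		/* lower 16 bits: the attribute */
	printf("type=%d attr=%d\n", DEMO_TYPE(priv), DEMO_ATTR(priv));
	return 0;
}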
484 /* 484 /*
485 * Reclaim flags for mem_cgroup_hierarchical_reclaim 485 * Reclaim flags for mem_cgroup_hierarchical_reclaim
486 */ 486 */
487 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 487 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
488 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 488 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
489 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 489 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
490 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 490 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
491 491
492 /* 492 /*
493 * The memcg_create_mutex will be held whenever a new cgroup is created. 493 * The memcg_create_mutex will be held whenever a new cgroup is created.
494 * As a consequence, any change that needs to protect against new child cgroups 494 * As a consequence, any change that needs to protect against new child cgroups
495 * appearing has to hold it as well. 495 * appearing has to hold it as well.
496 */ 496 */
497 static DEFINE_MUTEX(memcg_create_mutex); 497 static DEFINE_MUTEX(memcg_create_mutex);
498 498
499 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 499 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
500 { 500 {
501 return s ? container_of(s, struct mem_cgroup, css) : NULL; 501 return s ? container_of(s, struct mem_cgroup, css) : NULL;
502 } 502 }
503 503
504 /* Some nice accessors for the vmpressure. */ 504 /* Some nice accessors for the vmpressure. */
505 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 505 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
506 { 506 {
507 if (!memcg) 507 if (!memcg)
508 memcg = root_mem_cgroup; 508 memcg = root_mem_cgroup;
509 return &memcg->vmpressure; 509 return &memcg->vmpressure;
510 } 510 }
511 511
512 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 512 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
513 { 513 {
514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
515 } 515 }
516 516
517 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
518 { 518 {
519 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
520 } 520 }
521 521
522 /* 522 /*
523 * We restrict the id in the range of [1, 65535], so it can fit into 523 * We restrict the id in the range of [1, 65535], so it can fit into
524 * an unsigned short. 524 * an unsigned short.
525 */ 525 */
526 #define MEM_CGROUP_ID_MAX USHRT_MAX 526 #define MEM_CGROUP_ID_MAX USHRT_MAX
527 527
528 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 528 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
529 { 529 {
530 /* 530 /*
531 * The ID of the root cgroup is 0, but memcg treats 0 as an 531 * The ID of the root cgroup is 0, but memcg treats 0 as an
532 * invalid ID, so we return (cgroup_id + 1). 532 * invalid ID, so we return (cgroup_id + 1).
533 */ 533 */
534 return memcg->css.cgroup->id + 1; 534 return memcg->css.cgroup->id + 1;
535 } 535 }
536 536
537 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 537 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
538 { 538 {
539 struct cgroup_subsys_state *css; 539 struct cgroup_subsys_state *css;
540 540
541 css = css_from_id(id - 1, &memory_cgrp_subsys); 541 css = css_from_id(id - 1, &memory_cgrp_subsys);
542 return mem_cgroup_from_css(css); 542 return mem_cgroup_from_css(css);
543 } 543 }
544 544
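Because id 0 is reserved as "invalid", the pair of helpers above shifts the cgroup id by one in each direction so the result always fits a non-zero unsigned short (the [1, USHRT_MAX] range noted above). A standalone sketch of that round-trip, with a plain integer standing in for the css/cgroup structures (the demo_* helpers are made up for illustration):

#include <assert.h>
#include <limits.h>

/* Illustrative stand-ins: a "cgroup id" is just an int here. */
static unsigned short demo_mem_cgroup_id(int cgroup_id)
{
	return cgroup_id + 1;		/* root's 0 becomes 1, so 0 stays "invalid" */
}

static int demo_cgroup_id_from_memcg_id(unsigned short id)
{
	return id - 1;			/* inverse of the +1 above */
}

int main(void)
{
	assert(demo_mem_cgroup_id(0) == 1);		/* root cgroup */
	assert(demo_cgroup_id_from_memcg_id(1) == 0);	/* round-trips back to 0 */
	assert(demo_mem_cgroup_id(USHRT_MAX - 1) == USHRT_MAX);
	return 0;
}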
545 /* Writing them here to avoid exposing memcg's inner layout */ 545 /* Writing them here to avoid exposing memcg's inner layout */
546 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 546 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
547 547
548 void sock_update_memcg(struct sock *sk) 548 void sock_update_memcg(struct sock *sk)
549 { 549 {
550 if (mem_cgroup_sockets_enabled) { 550 if (mem_cgroup_sockets_enabled) {
551 struct mem_cgroup *memcg; 551 struct mem_cgroup *memcg;
552 struct cg_proto *cg_proto; 552 struct cg_proto *cg_proto;
553 553
554 BUG_ON(!sk->sk_prot->proto_cgroup); 554 BUG_ON(!sk->sk_prot->proto_cgroup);
555 555
556 /* Socket cloning can throw us here with sk_cgrp already 556 /* Socket cloning can throw us here with sk_cgrp already
557 * filled. It won't, however, necessarily happen from 557 * filled. It won't, however, necessarily happen from
558 * process context. So the test for root memcg given 558 * process context. So the test for root memcg given
559 * the current task's memcg won't help us in this case. 559 * the current task's memcg won't help us in this case.
560 * 560 *
561 * Respecting the original socket's memcg is a better 561 * Respecting the original socket's memcg is a better
562 * decision in this case. 562 * decision in this case.
563 */ 563 */
564 if (sk->sk_cgrp) { 564 if (sk->sk_cgrp) {
565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 565 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
566 css_get(&sk->sk_cgrp->memcg->css); 566 css_get(&sk->sk_cgrp->memcg->css);
567 return; 567 return;
568 } 568 }
569 569
570 rcu_read_lock(); 570 rcu_read_lock();
571 memcg = mem_cgroup_from_task(current); 571 memcg = mem_cgroup_from_task(current);
572 cg_proto = sk->sk_prot->proto_cgroup(memcg); 572 cg_proto = sk->sk_prot->proto_cgroup(memcg);
573 if (!mem_cgroup_is_root(memcg) && 573 if (!mem_cgroup_is_root(memcg) &&
574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { 574 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
575 sk->sk_cgrp = cg_proto; 575 sk->sk_cgrp = cg_proto;
576 } 576 }
577 rcu_read_unlock(); 577 rcu_read_unlock();
578 } 578 }
579 } 579 }
580 EXPORT_SYMBOL(sock_update_memcg); 580 EXPORT_SYMBOL(sock_update_memcg);
581 581
582 void sock_release_memcg(struct sock *sk) 582 void sock_release_memcg(struct sock *sk)
583 { 583 {
584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 584 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
585 struct mem_cgroup *memcg; 585 struct mem_cgroup *memcg;
586 WARN_ON(!sk->sk_cgrp->memcg); 586 WARN_ON(!sk->sk_cgrp->memcg);
587 memcg = sk->sk_cgrp->memcg; 587 memcg = sk->sk_cgrp->memcg;
588 css_put(&sk->sk_cgrp->memcg->css); 588 css_put(&sk->sk_cgrp->memcg->css);
589 } 589 }
590 } 590 }
591 591
592 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 592 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
593 { 593 {
594 if (!memcg || mem_cgroup_is_root(memcg)) 594 if (!memcg || mem_cgroup_is_root(memcg))
595 return NULL; 595 return NULL;
596 596
597 return &memcg->tcp_mem; 597 return &memcg->tcp_mem;
598 } 598 }
599 EXPORT_SYMBOL(tcp_proto_cgroup); 599 EXPORT_SYMBOL(tcp_proto_cgroup);
600 600
601 static void disarm_sock_keys(struct mem_cgroup *memcg) 601 static void disarm_sock_keys(struct mem_cgroup *memcg)
602 { 602 {
603 if (!memcg_proto_activated(&memcg->tcp_mem)) 603 if (!memcg_proto_activated(&memcg->tcp_mem))
604 return; 604 return;
605 static_key_slow_dec(&memcg_socket_limit_enabled); 605 static_key_slow_dec(&memcg_socket_limit_enabled);
606 } 606 }
607 #else 607 #else
608 static void disarm_sock_keys(struct mem_cgroup *memcg) 608 static void disarm_sock_keys(struct mem_cgroup *memcg)
609 { 609 {
610 } 610 }
611 #endif 611 #endif
612 612
613 #ifdef CONFIG_MEMCG_KMEM 613 #ifdef CONFIG_MEMCG_KMEM
614 /* 614 /*
615 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 615 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
616 * The main reason for not using cgroup id for this: 616 * The main reason for not using cgroup id for this:
617 * this works better in sparse environments, where we have a lot of memcgs, 617 * this works better in sparse environments, where we have a lot of memcgs,
618 * but only a few kmem-limited. For instance, if we have 200 618 * but only a few kmem-limited. For instance, if we have 200
619 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 619 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
620 * 200-entry array for that. 620 * 200-entry array for that.
621 * 621 *
622 * The current size of the caches array is stored in 622 * The current size of the caches array is stored in
623 * memcg_limited_groups_array_size. It will double each time we have to 623 * memcg_limited_groups_array_size. It will double each time we have to
624 * increase it. 624 * increase it.
625 */ 625 */
626 static DEFINE_IDA(kmem_limited_groups); 626 static DEFINE_IDA(kmem_limited_groups);
627 int memcg_limited_groups_array_size; 627 int memcg_limited_groups_array_size;
628 628
629 /* 629 /*
630 * MIN_SIZE is different than 1, because we would like to avoid going through 630 * MIN_SIZE is different than 1, because we would like to avoid going through
631 * the alloc/free process all the time. In a small machine, 4 kmem-limited 631 * the alloc/free process all the time. In a small machine, 4 kmem-limited
632 * cgroups is a reasonable guess. In the future, it could be a parameter or 632 * cgroups is a reasonable guess. In the future, it could be a parameter or
633 * tunable, but that is strictly not necessary. 633 * tunable, but that is strictly not necessary.
634 * 634 *
635 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 635 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
636 * this constant directly from cgroup, but it is understandable that this is 636 * this constant directly from cgroup, but it is understandable that this is
637 * better kept as an internal representation in cgroup.c. In any case, the 637 * better kept as an internal representation in cgroup.c. In any case, the
638 * cgrp_id space is not getting any smaller, and we don't have to necessarily 638 * cgrp_id space is not getting any smaller, and we don't have to necessarily
639 * increase ours as well if it increases. 639 * increase ours as well if it increases.
640 */ 640 */
641 #define MEMCG_CACHES_MIN_SIZE 4 641 #define MEMCG_CACHES_MIN_SIZE 4
642 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 642 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
643 643
644 /* 644 /*
645 * A lot of the calls to the cache allocation functions are expected to be 645 * A lot of the calls to the cache allocation functions are expected to be
646 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are 646 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
647 * conditional to this static branch, we'll have to allow modules that do 647 * conditional to this static branch, we'll have to allow modules that do
648 * kmem_cache_alloc and the like to see this symbol as well. 648 * kmem_cache_alloc and the like to see this symbol as well.
649 */ 649 */
650 struct static_key memcg_kmem_enabled_key; 650 struct static_key memcg_kmem_enabled_key;
651 EXPORT_SYMBOL(memcg_kmem_enabled_key); 651 EXPORT_SYMBOL(memcg_kmem_enabled_key);
652 652
653 static void disarm_kmem_keys(struct mem_cgroup *memcg) 653 static void disarm_kmem_keys(struct mem_cgroup *memcg)
654 { 654 {
655 if (memcg_kmem_is_active(memcg)) { 655 if (memcg_kmem_is_active(memcg)) {
656 static_key_slow_dec(&memcg_kmem_enabled_key); 656 static_key_slow_dec(&memcg_kmem_enabled_key);
657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); 657 ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
658 } 658 }
659 /* 659 /*
660 * This check can't live in kmem destruction function, 660 * This check can't live in kmem destruction function,
661 * since the charges will outlive the cgroup 661 * since the charges will outlive the cgroup
662 */ 662 */
663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); 663 WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
664 } 664 }
665 #else 665 #else
666 static void disarm_kmem_keys(struct mem_cgroup *memcg) 666 static void disarm_kmem_keys(struct mem_cgroup *memcg)
667 { 667 {
668 } 668 }
669 #endif /* CONFIG_MEMCG_KMEM */ 669 #endif /* CONFIG_MEMCG_KMEM */
670 670
671 static void disarm_static_keys(struct mem_cgroup *memcg) 671 static void disarm_static_keys(struct mem_cgroup *memcg)
672 { 672 {
673 disarm_sock_keys(memcg); 673 disarm_sock_keys(memcg);
674 disarm_kmem_keys(memcg); 674 disarm_kmem_keys(memcg);
675 } 675 }
676 676
677 static void drain_all_stock_async(struct mem_cgroup *memcg); 677 static void drain_all_stock_async(struct mem_cgroup *memcg);
678 678
679 static struct mem_cgroup_per_zone * 679 static struct mem_cgroup_per_zone *
680 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) 680 mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
681 { 681 {
682 VM_BUG_ON((unsigned)nid >= nr_node_ids); 682 VM_BUG_ON((unsigned)nid >= nr_node_ids);
683 return &memcg->nodeinfo[nid]->zoneinfo[zid]; 683 return &memcg->nodeinfo[nid]->zoneinfo[zid];
684 } 684 }
685 685
686 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) 686 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
687 { 687 {
688 return &memcg->css; 688 return &memcg->css;
689 } 689 }
690 690
691 static struct mem_cgroup_per_zone * 691 static struct mem_cgroup_per_zone *
692 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) 692 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
693 { 693 {
694 int nid = page_to_nid(page); 694 int nid = page_to_nid(page);
695 int zid = page_zonenum(page); 695 int zid = page_zonenum(page);
696 696
697 return mem_cgroup_zoneinfo(memcg, nid, zid); 697 return mem_cgroup_zoneinfo(memcg, nid, zid);
698 } 698 }
699 699
700 static struct mem_cgroup_tree_per_zone * 700 static struct mem_cgroup_tree_per_zone *
701 soft_limit_tree_node_zone(int nid, int zid) 701 soft_limit_tree_node_zone(int nid, int zid)
702 { 702 {
703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 703 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
704 } 704 }
705 705
706 static struct mem_cgroup_tree_per_zone * 706 static struct mem_cgroup_tree_per_zone *
707 soft_limit_tree_from_page(struct page *page) 707 soft_limit_tree_from_page(struct page *page)
708 { 708 {
709 int nid = page_to_nid(page); 709 int nid = page_to_nid(page);
710 int zid = page_zonenum(page); 710 int zid = page_zonenum(page);
711 711
712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 712 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
713 } 713 }
714 714
715 static void 715 static void
716 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, 716 __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
717 struct mem_cgroup_per_zone *mz, 717 struct mem_cgroup_per_zone *mz,
718 struct mem_cgroup_tree_per_zone *mctz, 718 struct mem_cgroup_tree_per_zone *mctz,
719 unsigned long long new_usage_in_excess) 719 unsigned long long new_usage_in_excess)
720 { 720 {
721 struct rb_node **p = &mctz->rb_root.rb_node; 721 struct rb_node **p = &mctz->rb_root.rb_node;
722 struct rb_node *parent = NULL; 722 struct rb_node *parent = NULL;
723 struct mem_cgroup_per_zone *mz_node; 723 struct mem_cgroup_per_zone *mz_node;
724 724
725 if (mz->on_tree) 725 if (mz->on_tree)
726 return; 726 return;
727 727
728 mz->usage_in_excess = new_usage_in_excess; 728 mz->usage_in_excess = new_usage_in_excess;
729 if (!mz->usage_in_excess) 729 if (!mz->usage_in_excess)
730 return; 730 return;
731 while (*p) { 731 while (*p) {
732 parent = *p; 732 parent = *p;
733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 733 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
734 tree_node); 734 tree_node);
735 if (mz->usage_in_excess < mz_node->usage_in_excess) 735 if (mz->usage_in_excess < mz_node->usage_in_excess)
736 p = &(*p)->rb_left; 736 p = &(*p)->rb_left;
737 /* 737 /*
738 * We can't avoid mem cgroups that are over their soft 738 * We can't avoid mem cgroups that are over their soft
739 * limit by the same amount 739 * limit by the same amount
740 */ 740 */
741 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 741 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
742 p = &(*p)->rb_right; 742 p = &(*p)->rb_right;
743 } 743 }
744 rb_link_node(&mz->tree_node, parent, p); 744 rb_link_node(&mz->tree_node, parent, p);
745 rb_insert_color(&mz->tree_node, &mctz->rb_root); 745 rb_insert_color(&mz->tree_node, &mctz->rb_root);
746 mz->on_tree = true; 746 mz->on_tree = true;
747 } 747 }
748 748
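The insertion above keys the per-zone rb-tree on usage_in_excess, sending equal keys to the right so that several memcgs over their soft limit by the same amount can all stay on the tree; reclaim then picks the rightmost (largest-excess) node. A small standalone sketch of the same ordering rule, using a sorted array instead of an rb-tree (all demo_* names and the sample values are illustrative):

#include <assert.h>
#include <stddef.h>

/*
 * Toy model: keep excess values in ascending order; ties go after existing
 * equal entries, mirroring the "< goes left, >= goes right" rule above.
 */
static void demo_insert(unsigned long long *tree, size_t *nr,
			unsigned long long excess)
{
	size_t i = *nr;

	if (!excess)			/* zero excess is never inserted */
		return;
	while (i > 0 && excess < tree[i - 1]) {
		tree[i] = tree[i - 1];	/* shift strictly larger entries right */
		i--;
	}
	tree[i] = excess;
	(*nr)++;
}

int main(void)
{
	unsigned long long tree[8];
	size_t nr = 0;

	demo_insert(tree, &nr, 4096);
	demo_insert(tree, &nr, 8192);
	demo_insert(tree, &nr, 4096);	/* same excess as the first entry */
	demo_insert(tree, &nr, 0);	/* ignored, like usage_in_excess == 0 */

	assert(nr == 3);		/* both 4096 entries coexist */
	assert(tree[nr - 1] == 8192);	/* "rightmost" == largest excess */
	return 0;
}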
749 static void 749 static void
750 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 750 __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
751 struct mem_cgroup_per_zone *mz, 751 struct mem_cgroup_per_zone *mz,
752 struct mem_cgroup_tree_per_zone *mctz) 752 struct mem_cgroup_tree_per_zone *mctz)
753 { 753 {
754 if (!mz->on_tree) 754 if (!mz->on_tree)
755 return; 755 return;
756 rb_erase(&mz->tree_node, &mctz->rb_root); 756 rb_erase(&mz->tree_node, &mctz->rb_root);
757 mz->on_tree = false; 757 mz->on_tree = false;
758 } 758 }
759 759
760 static void 760 static void
761 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, 761 mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
762 struct mem_cgroup_per_zone *mz, 762 struct mem_cgroup_per_zone *mz,
763 struct mem_cgroup_tree_per_zone *mctz) 763 struct mem_cgroup_tree_per_zone *mctz)
764 { 764 {
765 spin_lock(&mctz->lock); 765 spin_lock(&mctz->lock);
766 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 766 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
767 spin_unlock(&mctz->lock); 767 spin_unlock(&mctz->lock);
768 } 768 }
769 769
770 770
771 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) 771 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
772 { 772 {
773 unsigned long long excess; 773 unsigned long long excess;
774 struct mem_cgroup_per_zone *mz; 774 struct mem_cgroup_per_zone *mz;
775 struct mem_cgroup_tree_per_zone *mctz; 775 struct mem_cgroup_tree_per_zone *mctz;
776 int nid = page_to_nid(page); 776 int nid = page_to_nid(page);
777 int zid = page_zonenum(page); 777 int zid = page_zonenum(page);
778 mctz = soft_limit_tree_from_page(page); 778 mctz = soft_limit_tree_from_page(page);
779 779
780 /* 780 /*
781 * Necessary to update all ancestors when hierarchy is used, 781 * Necessary to update all ancestors when hierarchy is used,
782 * because their event counter is not touched. 782 * because their event counter is not touched.
783 */ 783 */
784 for (; memcg; memcg = parent_mem_cgroup(memcg)) { 784 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
785 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 785 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
786 excess = res_counter_soft_limit_excess(&memcg->res); 786 excess = res_counter_soft_limit_excess(&memcg->res);
787 /* 787 /*
788 * We have to update the tree if mz is on RB-tree or 788 * We have to update the tree if mz is on RB-tree or
789 * mem is over its softlimit. 789 * mem is over its softlimit.
790 */ 790 */
791 if (excess || mz->on_tree) { 791 if (excess || mz->on_tree) {
792 spin_lock(&mctz->lock); 792 spin_lock(&mctz->lock);
793 /* if on-tree, remove it */ 793 /* if on-tree, remove it */
794 if (mz->on_tree) 794 if (mz->on_tree)
795 __mem_cgroup_remove_exceeded(memcg, mz, mctz); 795 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
796 /* 796 /*
797 * Insert again. mz->usage_in_excess will be updated. 797 * Insert again. mz->usage_in_excess will be updated.
798 * If excess is 0, no tree ops. 798 * If excess is 0, no tree ops.
799 */ 799 */
800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); 800 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
801 spin_unlock(&mctz->lock); 801 spin_unlock(&mctz->lock);
802 } 802 }
803 } 803 }
804 } 804 }
805 805
806 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) 806 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
807 { 807 {
808 int node, zone; 808 int node, zone;
809 struct mem_cgroup_per_zone *mz; 809 struct mem_cgroup_per_zone *mz;
810 struct mem_cgroup_tree_per_zone *mctz; 810 struct mem_cgroup_tree_per_zone *mctz;
811 811
812 for_each_node(node) { 812 for_each_node(node) {
813 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 813 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
814 mz = mem_cgroup_zoneinfo(memcg, node, zone); 814 mz = mem_cgroup_zoneinfo(memcg, node, zone);
815 mctz = soft_limit_tree_node_zone(node, zone); 815 mctz = soft_limit_tree_node_zone(node, zone);
816 mem_cgroup_remove_exceeded(memcg, mz, mctz); 816 mem_cgroup_remove_exceeded(memcg, mz, mctz);
817 } 817 }
818 } 818 }
819 } 819 }
820 820
821 static struct mem_cgroup_per_zone * 821 static struct mem_cgroup_per_zone *
822 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 822 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
823 { 823 {
824 struct rb_node *rightmost = NULL; 824 struct rb_node *rightmost = NULL;
825 struct mem_cgroup_per_zone *mz; 825 struct mem_cgroup_per_zone *mz;
826 826
827 retry: 827 retry:
828 mz = NULL; 828 mz = NULL;
829 rightmost = rb_last(&mctz->rb_root); 829 rightmost = rb_last(&mctz->rb_root);
830 if (!rightmost) 830 if (!rightmost)
831 goto done; /* Nothing to reclaim from */ 831 goto done; /* Nothing to reclaim from */
832 832
833 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 833 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
834 /* 834 /*
835 * Remove the node now but someone else can add it back, 835 * Remove the node now but someone else can add it back,
836 * we will add it back at the end of reclaim to its correct 836 * we will add it back at the end of reclaim to its correct
837 * position in the tree. 837 * position in the tree.
838 */ 838 */
839 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 839 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
840 if (!res_counter_soft_limit_excess(&mz->memcg->res) || 840 if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
841 !css_tryget(&mz->memcg->css)) 841 !css_tryget(&mz->memcg->css))
842 goto retry; 842 goto retry;
843 done: 843 done:
844 return mz; 844 return mz;
845 } 845 }
846 846
847 static struct mem_cgroup_per_zone * 847 static struct mem_cgroup_per_zone *
848 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 848 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
849 { 849 {
850 struct mem_cgroup_per_zone *mz; 850 struct mem_cgroup_per_zone *mz;
851 851
852 spin_lock(&mctz->lock); 852 spin_lock(&mctz->lock);
853 mz = __mem_cgroup_largest_soft_limit_node(mctz); 853 mz = __mem_cgroup_largest_soft_limit_node(mctz);
854 spin_unlock(&mctz->lock); 854 spin_unlock(&mctz->lock);
855 return mz; 855 return mz;
856 } 856 }
857 857
858 /* 858 /*
859 * Implementation Note: reading percpu statistics for memcg. 859 * Implementation Note: reading percpu statistics for memcg.
860 * 860 *
861 * Both vmstat[] and percpu_counter have thresholds and do periodic 861 * Both vmstat[] and percpu_counter have thresholds and do periodic
862 * synchronization to implement a "quick" read. There is a trade-off between 862 * synchronization to implement a "quick" read. There is a trade-off between
863 * reading cost and precision of the value. We may eventually implement 863 * reading cost and precision of the value. We may eventually implement
864 * periodic synchronization of memcg's counters in the same way. 864 * periodic synchronization of memcg's counters in the same way.
865 * 865 *
866 * But this _read() function is used for the user interface now. The user 866 * But this _read() function is used for the user interface now. The user
867 * accounts memory usage by memory cgroup and _always_ requires an exact value 867 * accounts memory usage by memory cgroup and _always_ requires an exact value
868 * for that accounting. Even if we provided a quick-and-fuzzy read, we would 868 * for that accounting. Even if we provided a quick-and-fuzzy read, we would
869 * still have to visit all online cpus and compute the sum. So, for now, 869 * still have to visit all online cpus and compute the sum. So, for now,
870 * unnecessary synchronization is not implemented (only cpu hotplug is handled). 870 * unnecessary synchronization is not implemented (only cpu hotplug is handled).
871 * 871 *
872 * If there are kernel-internal users that can tolerate a not-exact value, 872 * If there are kernel-internal users that can tolerate a not-exact value,
873 * and reading all cpu values becomes a performance bottleneck in some common 873 * and reading all cpu values becomes a performance bottleneck in some common
874 * workload, a threshold and synchronization scheme like vmstat[] should be 874 * workload, a threshold and synchronization scheme like vmstat[] should be
875 * implemented. 875 * implemented.
876 */ 876 */
877 static long mem_cgroup_read_stat(struct mem_cgroup *memcg, 877 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
878 enum mem_cgroup_stat_index idx) 878 enum mem_cgroup_stat_index idx)
879 { 879 {
880 long val = 0; 880 long val = 0;
881 int cpu; 881 int cpu;
882 882
883 get_online_cpus(); 883 get_online_cpus();
884 for_each_online_cpu(cpu) 884 for_each_online_cpu(cpu)
885 val += per_cpu(memcg->stat->count[idx], cpu); 885 val += per_cpu(memcg->stat->count[idx], cpu);
886 #ifdef CONFIG_HOTPLUG_CPU 886 #ifdef CONFIG_HOTPLUG_CPU
887 spin_lock(&memcg->pcp_counter_lock); 887 spin_lock(&memcg->pcp_counter_lock);
888 val += memcg->nocpu_base.count[idx]; 888 val += memcg->nocpu_base.count[idx];
889 spin_unlock(&memcg->pcp_counter_lock); 889 spin_unlock(&memcg->pcp_counter_lock);
890 #endif 890 #endif
891 put_online_cpus(); 891 put_online_cpus();
892 return val; 892 return val;
893 } 893 }
894 894
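As the comment above explains, mem_cgroup_read_stat() takes the slow-but-exact path: it walks every online cpu and sums that cpu's slot of the counter, plus a "parked" base for cpus that went offline. A standalone sketch of the same idea, with a plain array standing in for the per-cpu storage (the demo_* names and sample numbers are made up):

#include <assert.h>

#define DEMO_NR_CPUS 4

/* Stand-ins for the per-cpu counters and the offline-cpu base value. */
static long demo_percpu_count[DEMO_NR_CPUS] = { 10, 20, 30, 40 };
static long demo_nocpu_base = 5;	/* counts parked when a cpu went offline */

static long demo_read_stat(void)
{
	long val = 0;
	int cpu;

	for (cpu = 0; cpu < DEMO_NR_CPUS; cpu++)	/* like for_each_online_cpu */
		val += demo_percpu_count[cpu];
	val += demo_nocpu_base;				/* the CONFIG_HOTPLUG_CPU leftover */
	return val;
}

int main(void)
{
	assert(demo_read_stat() == 105);	/* 10 + 20 + 30 + 40 + 5 */
	return 0;
}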
895 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, 895 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
896 bool charge) 896 bool charge)
897 { 897 {
898 int val = (charge) ? 1 : -1; 898 int val = (charge) ? 1 : -1;
899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); 899 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
900 } 900 }
901 901
902 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 902 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
903 enum mem_cgroup_events_index idx) 903 enum mem_cgroup_events_index idx)
904 { 904 {
905 unsigned long val = 0; 905 unsigned long val = 0;
906 int cpu; 906 int cpu;
907 907
908 get_online_cpus(); 908 get_online_cpus();
909 for_each_online_cpu(cpu) 909 for_each_online_cpu(cpu)
910 val += per_cpu(memcg->stat->events[idx], cpu); 910 val += per_cpu(memcg->stat->events[idx], cpu);
911 #ifdef CONFIG_HOTPLUG_CPU 911 #ifdef CONFIG_HOTPLUG_CPU
912 spin_lock(&memcg->pcp_counter_lock); 912 spin_lock(&memcg->pcp_counter_lock);
913 val += memcg->nocpu_base.events[idx]; 913 val += memcg->nocpu_base.events[idx];
914 spin_unlock(&memcg->pcp_counter_lock); 914 spin_unlock(&memcg->pcp_counter_lock);
915 #endif 915 #endif
916 put_online_cpus(); 916 put_online_cpus();
917 return val; 917 return val;
918 } 918 }
919 919
920 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, 920 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
921 struct page *page, 921 struct page *page,
922 bool anon, int nr_pages) 922 bool anon, int nr_pages)
923 { 923 {
924 /* 924 /*
925 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is 925 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
926 * counted as CACHE even if it's on ANON LRU. 926 * counted as CACHE even if it's on ANON LRU.
927 */ 927 */
928 if (anon) 928 if (anon)
929 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], 929 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
930 nr_pages); 930 nr_pages);
931 else 931 else
932 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], 932 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
933 nr_pages); 933 nr_pages);
934 934
935 if (PageTransHuge(page)) 935 if (PageTransHuge(page))
936 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 936 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
937 nr_pages); 937 nr_pages);
938 938
939 /* pagein of a big page is an event. So, ignore page size */ 939 /* pagein of a big page is an event. So, ignore page size */
940 if (nr_pages > 0) 940 if (nr_pages > 0)
941 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 941 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
942 else { 942 else {
943 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 943 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
944 nr_pages = -nr_pages; /* for event */ 944 nr_pages = -nr_pages; /* for event */
945 } 945 }
946 946
947 __this_cpu_add(memcg->stat->nr_page_events, nr_pages); 947 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
948 } 948 }
949 949
950 unsigned long 950 unsigned long
951 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) 951 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
952 { 952 {
953 struct mem_cgroup_per_zone *mz; 953 struct mem_cgroup_per_zone *mz;
954 954
955 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 955 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
956 return mz->lru_size[lru]; 956 return mz->lru_size[lru];
957 } 957 }
958 958
959 static unsigned long 959 static unsigned long
960 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 960 mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
961 unsigned int lru_mask) 961 unsigned int lru_mask)
962 { 962 {
963 struct mem_cgroup_per_zone *mz; 963 struct mem_cgroup_per_zone *mz;
964 enum lru_list lru; 964 enum lru_list lru;
965 unsigned long ret = 0; 965 unsigned long ret = 0;
966 966
967 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 967 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
968 968
969 for_each_lru(lru) { 969 for_each_lru(lru) {
970 if (BIT(lru) & lru_mask) 970 if (BIT(lru) & lru_mask)
971 ret += mz->lru_size[lru]; 971 ret += mz->lru_size[lru];
972 } 972 }
973 return ret; 973 return ret;
974 } 974 }
975 975
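mem_cgroup_zone_nr_lru_pages() treats lru_mask as a bitmap over the LRU lists and adds only the sizes whose bit is set; the node- and memcg-wide helpers below simply sum this over zones and nodes. A tiny standalone sketch of the mask test, with made-up list names and sizes (none of the DEMO_* identifiers exist in the kernel):

#include <assert.h>

#define DEMO_BIT(nr)	(1UL << (nr))

enum demo_lru_list { DEMO_INACTIVE_ANON, DEMO_ACTIVE_ANON,
		     DEMO_INACTIVE_FILE, DEMO_ACTIVE_FILE, DEMO_NR_LRU_LISTS };

static unsigned long demo_lru_size[DEMO_NR_LRU_LISTS] = { 100, 50, 200, 25 };

static unsigned long demo_nr_lru_pages(unsigned int lru_mask)
{
	unsigned long ret = 0;
	int lru;

	for (lru = 0; lru < DEMO_NR_LRU_LISTS; lru++)
		if (DEMO_BIT(lru) & lru_mask)	/* only lists selected by the mask */
			ret += demo_lru_size[lru];
	return ret;
}

int main(void)
{
	/* Count only the anon lists: 100 + 50. */
	assert(demo_nr_lru_pages(DEMO_BIT(DEMO_INACTIVE_ANON) |
				 DEMO_BIT(DEMO_ACTIVE_ANON)) == 150);
	return 0;
}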
976 static unsigned long 976 static unsigned long
977 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, 977 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
978 int nid, unsigned int lru_mask) 978 int nid, unsigned int lru_mask)
979 { 979 {
980 u64 total = 0; 980 u64 total = 0;
981 int zid; 981 int zid;
982 982
983 for (zid = 0; zid < MAX_NR_ZONES; zid++) 983 for (zid = 0; zid < MAX_NR_ZONES; zid++)
984 total += mem_cgroup_zone_nr_lru_pages(memcg, 984 total += mem_cgroup_zone_nr_lru_pages(memcg,
985 nid, zid, lru_mask); 985 nid, zid, lru_mask);
986 986
987 return total; 987 return total;
988 } 988 }
989 989
990 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, 990 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
991 unsigned int lru_mask) 991 unsigned int lru_mask)
992 { 992 {
993 int nid; 993 int nid;
994 u64 total = 0; 994 u64 total = 0;
995 995
996 for_each_node_state(nid, N_MEMORY) 996 for_each_node_state(nid, N_MEMORY)
997 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); 997 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
998 return total; 998 return total;
999 } 999 }
1000 1000
1001 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, 1001 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
1002 enum mem_cgroup_events_target target) 1002 enum mem_cgroup_events_target target)
1003 { 1003 {
1004 unsigned long val, next; 1004 unsigned long val, next;
1005 1005
1006 val = __this_cpu_read(memcg->stat->nr_page_events); 1006 val = __this_cpu_read(memcg->stat->nr_page_events);
1007 next = __this_cpu_read(memcg->stat->targets[target]); 1007 next = __this_cpu_read(memcg->stat->targets[target]);
1008 /* from time_after() in jiffies.h */ 1008 /* from time_after() in jiffies.h */
1009 if ((long)next - (long)val < 0) { 1009 if ((long)next - (long)val < 0) {
1010 switch (target) { 1010 switch (target) {
1011 case MEM_CGROUP_TARGET_THRESH: 1011 case MEM_CGROUP_TARGET_THRESH:
1012 next = val + THRESHOLDS_EVENTS_TARGET; 1012 next = val + THRESHOLDS_EVENTS_TARGET;
1013 break; 1013 break;
1014 case MEM_CGROUP_TARGET_SOFTLIMIT: 1014 case MEM_CGROUP_TARGET_SOFTLIMIT:
1015 next = val + SOFTLIMIT_EVENTS_TARGET; 1015 next = val + SOFTLIMIT_EVENTS_TARGET;
1016 break; 1016 break;
1017 case MEM_CGROUP_TARGET_NUMAINFO: 1017 case MEM_CGROUP_TARGET_NUMAINFO:
1018 next = val + NUMAINFO_EVENTS_TARGET; 1018 next = val + NUMAINFO_EVENTS_TARGET;
1019 break; 1019 break;
1020 default: 1020 default:
1021 break; 1021 break;
1022 } 1022 }
1023 __this_cpu_write(memcg->stat->targets[target], next); 1023 __this_cpu_write(memcg->stat->targets[target], next);
1024 return true; 1024 return true;
1025 } 1025 }
1026 return false; 1026 return false;
1027 } 1027 }
1028 1028
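The ratelimit check above borrows the time_after() trick: comparing "(long)next - (long)val < 0" instead of "val > next" keeps working when the unsigned event counter wraps around, and the target is then bumped by a per-target step. A standalone sketch of that comparison; DEMO_THRESHOLD is an arbitrary stand-in for constants like THRESHOLDS_EVENTS_TARGET:

#include <assert.h>
#include <limits.h>

#define DEMO_THRESHOLD 128	/* arbitrary stand-in for the per-target step */

/* Wraparound-safe "val has reached next", as in the function above. */
static int demo_ratelimit(unsigned long val, unsigned long *next)
{
	if ((long)*next - (long)val < 0) {
		*next = val + DEMO_THRESHOLD;	/* arm the next target */
		return 1;
	}
	return 0;
}

int main(void)
{
	unsigned long next = 100;

	assert(!demo_ratelimit(50, &next));	/* not there yet */
	assert(demo_ratelimit(101, &next));	/* crossed, target bumped */
	assert(next == 101 + DEMO_THRESHOLD);

	/* Still behaves when the counter wraps past ULONG_MAX. */
	next = ULONG_MAX - 10;
	assert(demo_ratelimit(5, &next));	/* 5 is "after" ULONG_MAX - 10 */
	return 0;
}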
1029 /* 1029 /*
1030 * Check events in order. 1030 * Check events in order.
1031 * 1031 *
1032 */ 1032 */
1033 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) 1033 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
1034 { 1034 {
1035 preempt_disable(); 1035 preempt_disable();
1036 /* threshold event is triggered in finer grain than soft limit */ 1036 /* threshold event is triggered in finer grain than soft limit */
1037 if (unlikely(mem_cgroup_event_ratelimit(memcg, 1037 if (unlikely(mem_cgroup_event_ratelimit(memcg,
1038 MEM_CGROUP_TARGET_THRESH))) { 1038 MEM_CGROUP_TARGET_THRESH))) {
1039 bool do_softlimit; 1039 bool do_softlimit;
1040 bool do_numainfo __maybe_unused; 1040 bool do_numainfo __maybe_unused;
1041 1041
1042 do_softlimit = mem_cgroup_event_ratelimit(memcg, 1042 do_softlimit = mem_cgroup_event_ratelimit(memcg,
1043 MEM_CGROUP_TARGET_SOFTLIMIT); 1043 MEM_CGROUP_TARGET_SOFTLIMIT);
1044 #if MAX_NUMNODES > 1 1044 #if MAX_NUMNODES > 1
1045 do_numainfo = mem_cgroup_event_ratelimit(memcg, 1045 do_numainfo = mem_cgroup_event_ratelimit(memcg,
1046 MEM_CGROUP_TARGET_NUMAINFO); 1046 MEM_CGROUP_TARGET_NUMAINFO);
1047 #endif 1047 #endif
1048 preempt_enable(); 1048 preempt_enable();
1049 1049
1050 mem_cgroup_threshold(memcg); 1050 mem_cgroup_threshold(memcg);
1051 if (unlikely(do_softlimit)) 1051 if (unlikely(do_softlimit))
1052 mem_cgroup_update_tree(memcg, page); 1052 mem_cgroup_update_tree(memcg, page);
1053 #if MAX_NUMNODES > 1 1053 #if MAX_NUMNODES > 1
1054 if (unlikely(do_numainfo)) 1054 if (unlikely(do_numainfo))
1055 atomic_inc(&memcg->numainfo_events); 1055 atomic_inc(&memcg->numainfo_events);
1056 #endif 1056 #endif
1057 } else 1057 } else
1058 preempt_enable(); 1058 preempt_enable();
1059 } 1059 }
1060 1060
1061 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 1061 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
1062 { 1062 {
1063 /* 1063 /*
1064 * mm_update_next_owner() may clear mm->owner to NULL 1064 * mm_update_next_owner() may clear mm->owner to NULL
1065 * if it races with swapoff, page migration, etc. 1065 * if it races with swapoff, page migration, etc.
1066 * So this can be called with p == NULL. 1066 * So this can be called with p == NULL.
1067 */ 1067 */
1068 if (unlikely(!p)) 1068 if (unlikely(!p))
1069 return NULL; 1069 return NULL;
1070 1070
1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); 1071 return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
1072 } 1072 }
1073 1073
1074 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) 1074 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
1075 { 1075 {
1076 struct mem_cgroup *memcg = NULL; 1076 struct mem_cgroup *memcg = NULL;
1077 1077
1078 rcu_read_lock(); 1078 rcu_read_lock();
1079 do { 1079 do {
1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1080 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1081 if (unlikely(!memcg)) 1081 if (unlikely(!memcg))
1082 memcg = root_mem_cgroup; 1082 memcg = root_mem_cgroup;
1083 } while (!css_tryget(&memcg->css)); 1083 } while (!css_tryget(&memcg->css));
1084 rcu_read_unlock(); 1084 rcu_read_unlock();
1085 return memcg; 1085 return memcg;
1086 } 1086 }
1087 1087
1088 /* 1088 /*
1089 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1089 * Returns a next (in a pre-order walk) alive memcg (with elevated css
1090 * ref. count) or NULL if the whole root's subtree has been visited. 1090 * ref. count) or NULL if the whole root's subtree has been visited.
1091 * 1091 *
1092 * helper function to be used by mem_cgroup_iter 1092 * helper function to be used by mem_cgroup_iter
1093 */ 1093 */
1094 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1094 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root,
1095 struct mem_cgroup *last_visited) 1095 struct mem_cgroup *last_visited)
1096 { 1096 {
1097 struct cgroup_subsys_state *prev_css, *next_css; 1097 struct cgroup_subsys_state *prev_css, *next_css;
1098 1098
1099 prev_css = last_visited ? &last_visited->css : NULL; 1099 prev_css = last_visited ? &last_visited->css : NULL;
1100 skip_node: 1100 skip_node:
1101 next_css = css_next_descendant_pre(prev_css, &root->css); 1101 next_css = css_next_descendant_pre(prev_css, &root->css);
1102 1102
1103 /* 1103 /*
1104 * Even if we found a group, we have to make sure it is 1104 * Even if we found a group, we have to make sure it is
1105 * alive. css && !memcg means that the group should be 1105 * alive. css && !memcg means that the group should be
1106 * skipped and we should continue the tree walk. 1106 * skipped and we should continue the tree walk.
1107 * last_visited css is safe to use because it is 1107 * last_visited css is safe to use because it is
1108 * protected by css_get and the tree walk is rcu safe. 1108 * protected by css_get and the tree walk is rcu safe.
1109 * 1109 *
1110 * We do not take a reference on the root of the tree walk 1110 * We do not take a reference on the root of the tree walk
1111 * because we might race with the root removal when it would 1111 * because we might race with the root removal when it would
1112 * be the only node in the iterated hierarchy and mem_cgroup_iter 1112 * be the only node in the iterated hierarchy and mem_cgroup_iter
1113 * would end up in an endless loop because it expects that at 1113 * would end up in an endless loop because it expects that at
1114 * least one valid node will be returned. Root cannot disappear 1114 * least one valid node will be returned. Root cannot disappear
1115 * because the caller of the iterator should hold it already, so 1115 * because the caller of the iterator should hold it already, so
1116 * skipping the css reference should be safe. 1116 * skipping the css reference should be safe.
1117 */ 1117 */
1118 if (next_css) { 1118 if (next_css) {
1119 if ((next_css == &root->css) || 1119 if ((next_css == &root->css) ||
1120 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) 1120 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1121 return mem_cgroup_from_css(next_css); 1121 return mem_cgroup_from_css(next_css);
1122 1122
1123 prev_css = next_css; 1123 prev_css = next_css;
1124 goto skip_node; 1124 goto skip_node;
1125 } 1125 }
1126 1126
1127 return NULL; 1127 return NULL;
1128 } 1128 }
1129 1129
1130 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) 1130 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root)
1131 { 1131 {
1132 /* 1132 /*
1133 * When a group in the hierarchy below root is destroyed, the 1133 * When a group in the hierarchy below root is destroyed, the
1134 * hierarchy iterator can no longer be trusted since it might 1134 * hierarchy iterator can no longer be trusted since it might
1135 * have pointed to the destroyed group. Invalidate it. 1135 * have pointed to the destroyed group. Invalidate it.
1136 */ 1136 */
1137 atomic_inc(&root->dead_count); 1137 atomic_inc(&root->dead_count);
1138 } 1138 }
1139 1139
1140 static struct mem_cgroup * 1140 static struct mem_cgroup *
1141 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, 1141 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1142 struct mem_cgroup *root, 1142 struct mem_cgroup *root,
1143 int *sequence) 1143 int *sequence)
1144 { 1144 {
1145 struct mem_cgroup *position = NULL; 1145 struct mem_cgroup *position = NULL;
1146 /* 1146 /*
1147 * A cgroup destruction happens in two stages: offlining and 1147 * A cgroup destruction happens in two stages: offlining and
1148 * release. They are separated by a RCU grace period. 1148 * release. They are separated by a RCU grace period.
1149 * 1149 *
1150 * If the iterator is valid, we may still race with an 1150 * If the iterator is valid, we may still race with an
1151 * offlining. The RCU lock ensures the object won't be 1151 * offlining. The RCU lock ensures the object won't be
1152 * released, tryget will fail if we lost the race. 1152 * released, tryget will fail if we lost the race.
1153 */ 1153 */
1154 *sequence = atomic_read(&root->dead_count); 1154 *sequence = atomic_read(&root->dead_count);
1155 if (iter->last_dead_count == *sequence) { 1155 if (iter->last_dead_count == *sequence) {
1156 smp_rmb(); 1156 smp_rmb();
1157 position = iter->last_visited; 1157 position = iter->last_visited;
1158 1158
1159 /* 1159 /*
1160 * We cannot take a reference to root because we might race 1160 * We cannot take a reference to root because we might race
1161 * with root removal and returning NULL would end up in 1161 * with root removal and returning NULL would end up in
1162 * an endless loop on the iterator user level when root 1162 * an endless loop on the iterator user level when root
1163 * would be returned all the time. 1163 * would be returned all the time.
1164 */ 1164 */
1165 if (position && position != root && 1165 if (position && position != root &&
1166 !css_tryget(&position->css)) 1166 !css_tryget(&position->css))
1167 position = NULL; 1167 position = NULL;
1168 } 1168 }
1169 return position; 1169 return position;
1170 } 1170 }
1171 1171
1172 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1172 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1173 struct mem_cgroup *last_visited, 1173 struct mem_cgroup *last_visited,
1174 struct mem_cgroup *new_position, 1174 struct mem_cgroup *new_position,
1175 struct mem_cgroup *root, 1175 struct mem_cgroup *root,
1176 int sequence) 1176 int sequence)
1177 { 1177 {
1178 /* root reference counting symmetric to mem_cgroup_iter_load */ 1178 /* root reference counting symmetric to mem_cgroup_iter_load */
1179 if (last_visited && last_visited != root) 1179 if (last_visited && last_visited != root)
1180 css_put(&last_visited->css); 1180 css_put(&last_visited->css);
1181 /* 1181 /*
1182 * We store the sequence count from the time @last_visited was 1182 * We store the sequence count from the time @last_visited was
1183 * loaded successfully instead of rereading it here so that we 1183 * loaded successfully instead of rereading it here so that we
1184 * don't lose destruction events in between. We could have 1184 * don't lose destruction events in between. We could have
1185 * raced with the destruction of @new_position after all. 1185 * raced with the destruction of @new_position after all.
1186 */ 1186 */
1187 iter->last_visited = new_position; 1187 iter->last_visited = new_position;
1188 smp_wmb(); 1188 smp_wmb();
1189 iter->last_dead_count = sequence; 1189 iter->last_dead_count = sequence;
1190 } 1190 }
1191 1191
1192 /** 1192 /**
1193 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1193 * mem_cgroup_iter - iterate over memory cgroup hierarchy
1194 * @root: hierarchy root 1194 * @root: hierarchy root
1195 * @prev: previously returned memcg, NULL on first invocation 1195 * @prev: previously returned memcg, NULL on first invocation
1196 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1196 * @reclaim: cookie for shared reclaim walks, NULL for full walks
1197 * 1197 *
1198 * Returns references to children of the hierarchy below @root, or 1198 * Returns references to children of the hierarchy below @root, or
1199 * @root itself, or %NULL after a full round-trip. 1199 * @root itself, or %NULL after a full round-trip.
1200 * 1200 *
1201 * Caller must pass the return value in @prev on subsequent 1201 * Caller must pass the return value in @prev on subsequent
1202 * invocations for reference counting, or use mem_cgroup_iter_break() 1202 * invocations for reference counting, or use mem_cgroup_iter_break()
1203 * to cancel a hierarchy walk before the round-trip is complete. 1203 * to cancel a hierarchy walk before the round-trip is complete.
1204 * 1204 *
1205 * Reclaimers can specify a zone and a priority level in @reclaim to 1205 * Reclaimers can specify a zone and a priority level in @reclaim to
1206 * divide up the memcgs in the hierarchy among all concurrent 1206 * divide up the memcgs in the hierarchy among all concurrent
1207 * reclaimers operating on the same zone and priority. 1207 * reclaimers operating on the same zone and priority.
1208 */ 1208 */
1209 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1209 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1210 struct mem_cgroup *prev, 1210 struct mem_cgroup *prev,
1211 struct mem_cgroup_reclaim_cookie *reclaim) 1211 struct mem_cgroup_reclaim_cookie *reclaim)
1212 { 1212 {
1213 struct mem_cgroup *memcg = NULL; 1213 struct mem_cgroup *memcg = NULL;
1214 struct mem_cgroup *last_visited = NULL; 1214 struct mem_cgroup *last_visited = NULL;
1215 1215
1216 if (mem_cgroup_disabled()) 1216 if (mem_cgroup_disabled())
1217 return NULL; 1217 return NULL;
1218 1218
1219 if (!root) 1219 if (!root)
1220 root = root_mem_cgroup; 1220 root = root_mem_cgroup;
1221 1221
1222 if (prev && !reclaim) 1222 if (prev && !reclaim)
1223 last_visited = prev; 1223 last_visited = prev;
1224 1224
1225 if (!root->use_hierarchy && root != root_mem_cgroup) { 1225 if (!root->use_hierarchy && root != root_mem_cgroup) {
1226 if (prev) 1226 if (prev)
1227 goto out_css_put; 1227 goto out_css_put;
1228 return root; 1228 return root;
1229 } 1229 }
1230 1230
1231 rcu_read_lock(); 1231 rcu_read_lock();
1232 while (!memcg) { 1232 while (!memcg) {
1233 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1233 struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
1234 int uninitialized_var(seq); 1234 int uninitialized_var(seq);
1235 1235
1236 if (reclaim) { 1236 if (reclaim) {
1237 int nid = zone_to_nid(reclaim->zone); 1237 int nid = zone_to_nid(reclaim->zone);
1238 int zid = zone_idx(reclaim->zone); 1238 int zid = zone_idx(reclaim->zone);
1239 struct mem_cgroup_per_zone *mz; 1239 struct mem_cgroup_per_zone *mz;
1240 1240
1241 mz = mem_cgroup_zoneinfo(root, nid, zid); 1241 mz = mem_cgroup_zoneinfo(root, nid, zid);
1242 iter = &mz->reclaim_iter[reclaim->priority]; 1242 iter = &mz->reclaim_iter[reclaim->priority];
1243 if (prev && reclaim->generation != iter->generation) { 1243 if (prev && reclaim->generation != iter->generation) {
1244 iter->last_visited = NULL; 1244 iter->last_visited = NULL;
1245 goto out_unlock; 1245 goto out_unlock;
1246 } 1246 }
1247 1247
1248 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1248 last_visited = mem_cgroup_iter_load(iter, root, &seq);
1249 } 1249 }
1250 1250
1251 memcg = __mem_cgroup_iter_next(root, last_visited); 1251 memcg = __mem_cgroup_iter_next(root, last_visited);
1252 1252
1253 if (reclaim) { 1253 if (reclaim) {
1254 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1254 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1255 seq); 1255 seq);
1256 1256
1257 if (!memcg) 1257 if (!memcg)
1258 iter->generation++; 1258 iter->generation++;
1259 else if (!prev && memcg) 1259 else if (!prev && memcg)
1260 reclaim->generation = iter->generation; 1260 reclaim->generation = iter->generation;
1261 } 1261 }
1262 1262
1263 if (prev && !memcg) 1263 if (prev && !memcg)
1264 goto out_unlock; 1264 goto out_unlock;
1265 } 1265 }
1266 out_unlock: 1266 out_unlock:
1267 rcu_read_unlock(); 1267 rcu_read_unlock();
1268 out_css_put: 1268 out_css_put:
1269 if (prev && prev != root) 1269 if (prev && prev != root)
1270 css_put(&prev->css); 1270 css_put(&prev->css);
1271 1271
1272 return memcg; 1272 return memcg;
1273 } 1273 }
1274 1274
1275 /** 1275 /**
1276 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1276 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1277 * @root: hierarchy root 1277 * @root: hierarchy root
1278 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1278 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1279 */ 1279 */
1280 void mem_cgroup_iter_break(struct mem_cgroup *root, 1280 void mem_cgroup_iter_break(struct mem_cgroup *root,
1281 struct mem_cgroup *prev) 1281 struct mem_cgroup *prev)
1282 { 1282 {
1283 if (!root) 1283 if (!root)
1284 root = root_mem_cgroup; 1284 root = root_mem_cgroup;
1285 if (prev && prev != root) 1285 if (prev && prev != root)
1286 css_put(&prev->css); 1286 css_put(&prev->css);
1287 } 1287 }
1288 1288
1289 /* 1289 /*
1290 * Iteration constructs for visiting all cgroups (under a tree). If 1290 * Iteration constructs for visiting all cgroups (under a tree). If
1291 * loops are exited prematurely (break), mem_cgroup_iter_break() must 1291 * loops are exited prematurely (break), mem_cgroup_iter_break() must
1292 * be used for reference counting. 1292 * be used for reference counting.
1293 */ 1293 */
1294 #define for_each_mem_cgroup_tree(iter, root) \ 1294 #define for_each_mem_cgroup_tree(iter, root) \
1295 for (iter = mem_cgroup_iter(root, NULL, NULL); \ 1295 for (iter = mem_cgroup_iter(root, NULL, NULL); \
1296 iter != NULL; \ 1296 iter != NULL; \
1297 iter = mem_cgroup_iter(root, iter, NULL)) 1297 iter = mem_cgroup_iter(root, iter, NULL))
1298 1298
1299 #define for_each_mem_cgroup(iter) \ 1299 #define for_each_mem_cgroup(iter) \
1300 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ 1300 for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
1301 iter != NULL; \ 1301 iter != NULL; \
1302 iter = mem_cgroup_iter(NULL, iter, NULL)) 1302 iter = mem_cgroup_iter(NULL, iter, NULL))
1303 1303
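As the comment above the macros notes, a walk that is abandoned early must drop the reference held on the last returned memcg. Below is a purely illustrative, non-compilable-in-isolation sketch of a caller honoring that rule with the macros defined above; demo_want_to_stop() is a made-up predicate, not a helper that exists in this file or in this patch.

/* Illustrative only -- not part of this patch. */
static void demo_walk_hierarchy(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;

	for_each_mem_cgroup_tree(iter, root) {
		if (demo_want_to_stop(iter)) {		/* hypothetical predicate */
			/* bailing out early: release the css reference */
			mem_cgroup_iter_break(root, iter);
			break;
		}
	}
}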
1304 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 1304 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1305 { 1305 {
1306 struct mem_cgroup *memcg; 1306 struct mem_cgroup *memcg;
1307 1307
1308 rcu_read_lock(); 1308 rcu_read_lock();
1309 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1309 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1310 if (unlikely(!memcg)) 1310 if (unlikely(!memcg))
1311 goto out; 1311 goto out;
1312 1312
1313 switch (idx) { 1313 switch (idx) {
1314 case PGFAULT: 1314 case PGFAULT:
1315 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); 1315 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1316 break; 1316 break;
1317 case PGMAJFAULT: 1317 case PGMAJFAULT:
1318 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); 1318 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1319 break; 1319 break;
1320 default: 1320 default:
1321 BUG(); 1321 BUG();
1322 } 1322 }
1323 out: 1323 out:
1324 rcu_read_unlock(); 1324 rcu_read_unlock();
1325 } 1325 }
1326 EXPORT_SYMBOL(__mem_cgroup_count_vm_event); 1326 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1327 1327
1328 /** 1328 /**
1329 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1329 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1330 * @zone: zone of the wanted lruvec 1330 * @zone: zone of the wanted lruvec
1331 * @memcg: memcg of the wanted lruvec 1331 * @memcg: memcg of the wanted lruvec
1332 * 1332 *
1333 * Returns the lru list vector holding pages for the given @zone and 1333 * Returns the lru list vector holding pages for the given @zone and
1334 * @memcg. This can be the global zone lruvec, if the memory controller 1334 * @memcg. This can be the global zone lruvec, if the memory controller
1335 * is disabled. 1335 * is disabled.
1336 */ 1336 */
1337 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone, 1337 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1338 struct mem_cgroup *memcg) 1338 struct mem_cgroup *memcg)
1339 { 1339 {
1340 struct mem_cgroup_per_zone *mz; 1340 struct mem_cgroup_per_zone *mz;
1341 struct lruvec *lruvec; 1341 struct lruvec *lruvec;
1342 1342
1343 if (mem_cgroup_disabled()) { 1343 if (mem_cgroup_disabled()) {
1344 lruvec = &zone->lruvec; 1344 lruvec = &zone->lruvec;
1345 goto out; 1345 goto out;
1346 } 1346 }
1347 1347
1348 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); 1348 mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
1349 lruvec = &mz->lruvec; 1349 lruvec = &mz->lruvec;
1350 out: 1350 out:
1351 /* 1351 /*
1352 * Since a node can be onlined after the mem_cgroup was created, 1352 * Since a node can be onlined after the mem_cgroup was created,
1353 * we have to be prepared to initialize lruvec->zone here; 1353 * we have to be prepared to initialize lruvec->zone here;
1354 * and if offlined then reonlined, we need to reinitialize it. 1354 * and if offlined then reonlined, we need to reinitialize it.
1355 */ 1355 */
1356 if (unlikely(lruvec->zone != zone)) 1356 if (unlikely(lruvec->zone != zone))
1357 lruvec->zone = zone; 1357 lruvec->zone = zone;
1358 return lruvec; 1358 return lruvec;
1359 } 1359 }
1360 1360
1361 /* 1361 /*
1362 * Following LRU functions are allowed to be used without PCG_LOCK. 1362 * Following LRU functions are allowed to be used without PCG_LOCK.
1363 * Operations are called by routine of global LRU independently from memcg. 1363 * Operations are called by routine of global LRU independently from memcg.
1364 * What we have to take care of here is the validity of pc->mem_cgroup. 1364 * What we have to take care of here is the validity of pc->mem_cgroup.
1365 * 1365 *
1366 * Changes to pc->mem_cgroup happens when 1366 * Changes to pc->mem_cgroup happens when
1367 * 1. charge 1367 * 1. charge
1368 * 2. moving account 1368 * 2. moving account
1369 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache. 1369 * In the typical case, "charge" is done before add-to-lru. The exception is SwapCache.
1370 * It is added to LRU before charge. 1370 * It is added to LRU before charge.
1371 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 1371 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
1372 * When moving account, the page is not on LRU. It's isolated. 1372 * When moving account, the page is not on LRU. It's isolated.
1373 */ 1373 */
1374 1374
1375 /** 1375 /**
1376 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1376 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1377 * @page: the page 1377 * @page: the page
1378 * @zone: zone of the page 1378 * @zone: zone of the page
1379 */ 1379 */
1380 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1380 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1381 { 1381 {
1382 struct mem_cgroup_per_zone *mz; 1382 struct mem_cgroup_per_zone *mz;
1383 struct mem_cgroup *memcg; 1383 struct mem_cgroup *memcg;
1384 struct page_cgroup *pc; 1384 struct page_cgroup *pc;
1385 struct lruvec *lruvec; 1385 struct lruvec *lruvec;
1386 1386
1387 if (mem_cgroup_disabled()) { 1387 if (mem_cgroup_disabled()) {
1388 lruvec = &zone->lruvec; 1388 lruvec = &zone->lruvec;
1389 goto out; 1389 goto out;
1390 } 1390 }
1391 1391
1392 pc = lookup_page_cgroup(page); 1392 pc = lookup_page_cgroup(page);
1393 memcg = pc->mem_cgroup; 1393 memcg = pc->mem_cgroup;
1394 1394
1395 /* 1395 /*
1396 * Surreptitiously switch any uncharged offlist page to root: 1396 * Surreptitiously switch any uncharged offlist page to root:
1397 * an uncharged page off lru does nothing to secure 1397 * an uncharged page off lru does nothing to secure
1398 * its former mem_cgroup from sudden removal. 1398 * its former mem_cgroup from sudden removal.
1399 * 1399 *
1400 * Our caller holds lru_lock, and PageCgroupUsed is updated 1400 * Our caller holds lru_lock, and PageCgroupUsed is updated
1401 * under page_cgroup lock: between them, they make all uses 1401 * under page_cgroup lock: between them, they make all uses
1402 * of pc->mem_cgroup safe. 1402 * of pc->mem_cgroup safe.
1403 */ 1403 */
1404 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1404 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1405 pc->mem_cgroup = memcg = root_mem_cgroup; 1405 pc->mem_cgroup = memcg = root_mem_cgroup;
1406 1406
1407 mz = page_cgroup_zoneinfo(memcg, page); 1407 mz = page_cgroup_zoneinfo(memcg, page);
1408 lruvec = &mz->lruvec; 1408 lruvec = &mz->lruvec;
1409 out: 1409 out:
1410 /* 1410 /*
1411 * Since a node can be onlined after the mem_cgroup was created, 1411 * Since a node can be onlined after the mem_cgroup was created,
1412 * we have to be prepared to initialize lruvec->zone here; 1412 * we have to be prepared to initialize lruvec->zone here;
1413 * and if offlined then reonlined, we need to reinitialize it. 1413 * and if offlined then reonlined, we need to reinitialize it.
1414 */ 1414 */
1415 if (unlikely(lruvec->zone != zone)) 1415 if (unlikely(lruvec->zone != zone))
1416 lruvec->zone = zone; 1416 lruvec->zone = zone;
1417 return lruvec; 1417 return lruvec;
1418 } 1418 }
1419 1419
1420 /** 1420 /**
1421 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1421 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1422 * @lruvec: mem_cgroup per zone lru vector 1422 * @lruvec: mem_cgroup per zone lru vector
1423 * @lru: index of lru list the page is sitting on 1423 * @lru: index of lru list the page is sitting on
1424 * @nr_pages: positive when adding or negative when removing 1424 * @nr_pages: positive when adding or negative when removing
1425 * 1425 *
1426 * This function must be called when a page is added to or removed from an 1426 * This function must be called when a page is added to or removed from an
1427 * lru list. 1427 * lru list.
1428 */ 1428 */
1429 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1429 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1430 int nr_pages) 1430 int nr_pages)
1431 { 1431 {
1432 struct mem_cgroup_per_zone *mz; 1432 struct mem_cgroup_per_zone *mz;
1433 unsigned long *lru_size; 1433 unsigned long *lru_size;
1434 1434
1435 if (mem_cgroup_disabled()) 1435 if (mem_cgroup_disabled())
1436 return; 1436 return;
1437 1437
1438 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1438 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1439 lru_size = mz->lru_size + lru; 1439 lru_size = mz->lru_size + lru;
1440 *lru_size += nr_pages; 1440 *lru_size += nr_pages;
1441 VM_BUG_ON((long)(*lru_size) < 0); 1441 VM_BUG_ON((long)(*lru_size) < 0);
1442 } 1442 }
1443 1443
1444 /* 1444 /*
1445 * Checks whether given mem is same or in the root_mem_cgroup's 1445 * Checks whether given mem is same or in the root_mem_cgroup's
1446 * hierarchy subtree 1446 * hierarchy subtree
1447 */ 1447 */
1448 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1448 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1449 struct mem_cgroup *memcg) 1449 struct mem_cgroup *memcg)
1450 { 1450 {
1451 if (root_memcg == memcg) 1451 if (root_memcg == memcg)
1452 return true; 1452 return true;
1453 if (!root_memcg->use_hierarchy || !memcg) 1453 if (!root_memcg->use_hierarchy || !memcg)
1454 return false; 1454 return false;
1455 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1455 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
1456 } 1456 }
1457 1457
1458 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1458 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1459 struct mem_cgroup *memcg) 1459 struct mem_cgroup *memcg)
1460 { 1460 {
1461 bool ret; 1461 bool ret;
1462 1462
1463 rcu_read_lock(); 1463 rcu_read_lock();
1464 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1464 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1465 rcu_read_unlock(); 1465 rcu_read_unlock();
1466 return ret; 1466 return ret;
1467 } 1467 }
1468 1468
1469 bool task_in_mem_cgroup(struct task_struct *task, 1469 bool task_in_mem_cgroup(struct task_struct *task,
1470 const struct mem_cgroup *memcg) 1470 const struct mem_cgroup *memcg)
1471 { 1471 {
1472 struct mem_cgroup *curr = NULL; 1472 struct mem_cgroup *curr = NULL;
1473 struct task_struct *p; 1473 struct task_struct *p;
1474 bool ret; 1474 bool ret;
1475 1475
1476 p = find_lock_task_mm(task); 1476 p = find_lock_task_mm(task);
1477 if (p) { 1477 if (p) {
1478 curr = get_mem_cgroup_from_mm(p->mm); 1478 curr = get_mem_cgroup_from_mm(p->mm);
1479 task_unlock(p); 1479 task_unlock(p);
1480 } else { 1480 } else {
1481 /* 1481 /*
1482 * All threads may have already detached their mm's, but the oom 1482 * All threads may have already detached their mm's, but the oom
1483 * killer still needs to detect if they have already been oom 1483 * killer still needs to detect if they have already been oom
1484 * killed to prevent needlessly killing additional tasks. 1484 * killed to prevent needlessly killing additional tasks.
1485 */ 1485 */
1486 rcu_read_lock(); 1486 rcu_read_lock();
1487 curr = mem_cgroup_from_task(task); 1487 curr = mem_cgroup_from_task(task);
1488 if (curr) 1488 if (curr)
1489 css_get(&curr->css); 1489 css_get(&curr->css);
1490 rcu_read_unlock(); 1490 rcu_read_unlock();
1491 } 1491 }
1492 /* 1492 /*
1493 * We should check use_hierarchy of "memcg", not "curr". Checking 1493 * We should check use_hierarchy of "memcg", not "curr". Checking
1494 * use_hierarchy of "curr" here would make this function return true if 1494 * use_hierarchy of "curr" here would make this function return true if
1495 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the 1495 * hierarchy is enabled in "curr" and "curr" is a child of "memcg" in the
1496 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg"). 1496 * *cgroup* hierarchy (even if use_hierarchy is disabled in "memcg").
1497 */ 1497 */
1498 ret = mem_cgroup_same_or_subtree(memcg, curr); 1498 ret = mem_cgroup_same_or_subtree(memcg, curr);
1499 css_put(&curr->css); 1499 css_put(&curr->css);
1500 return ret; 1500 return ret;
1501 } 1501 }
1502 1502
1503 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1503 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1504 { 1504 {
1505 unsigned long inactive_ratio; 1505 unsigned long inactive_ratio;
1506 unsigned long inactive; 1506 unsigned long inactive;
1507 unsigned long active; 1507 unsigned long active;
1508 unsigned long gb; 1508 unsigned long gb;
1509 1509
1510 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1510 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1511 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1511 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1512 1512
1513 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1513 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1514 if (gb) 1514 if (gb)
1515 inactive_ratio = int_sqrt(10 * gb); 1515 inactive_ratio = int_sqrt(10 * gb);
1516 else 1516 else
1517 inactive_ratio = 1; 1517 inactive_ratio = 1;
1518 1518
1519 return inactive * inactive_ratio < active; 1519 return inactive * inactive_ratio < active;
1520 } 1520 }
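For orientation only, here is a stand-alone user-space sketch (not part of this patch) that mirrors the inactive/active heuristic above. It assumes 4KB pages, substitutes libm's sqrt() for the kernel's int_sqrt(), and the page counts and the file name sketch.c are made up for illustration; build with `cc sketch.c -lm`.

#include <math.h>
#include <stdio.h>

/* Mirror of the heuristic above: inactive_ratio = int_sqrt(10 * gb), minimum 1. */
static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - 12);	/* 4KB pages -> GiB */

	return gb ? (unsigned long)sqrt(10 * gb) : 1;
}

int main(void)
{
	unsigned long inactive = 100000, active = 900000;	/* ~3.8GiB of anon pages */
	unsigned long ratio = inactive_ratio(inactive, active);	/* int_sqrt(30) == 5 */

	/* Prints 1: the inactive list is considered low relative to the active list. */
	printf("inactive_is_low = %d\n", inactive * ratio < active);
	return 0;
}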
1521 1521
1522 #define mem_cgroup_from_res_counter(counter, member) \ 1522 #define mem_cgroup_from_res_counter(counter, member) \
1523 container_of(counter, struct mem_cgroup, member) 1523 container_of(counter, struct mem_cgroup, member)
1524 1524
1525 /** 1525 /**
1526 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1526 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1527 * @memcg: the memory cgroup 1527 * @memcg: the memory cgroup
1528 * 1528 *
1529 * Returns the maximum amount of memory @memcg can be charged with, in 1529 * Returns the maximum amount of memory @memcg can be charged with, in
1530 * pages. 1530 * pages.
1531 */ 1531 */
1532 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1532 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1533 { 1533 {
1534 unsigned long long margin; 1534 unsigned long long margin;
1535 1535
1536 margin = res_counter_margin(&memcg->res); 1536 margin = res_counter_margin(&memcg->res);
1537 if (do_swap_account) 1537 if (do_swap_account)
1538 margin = min(margin, res_counter_margin(&memcg->memsw)); 1538 margin = min(margin, res_counter_margin(&memcg->memsw));
1539 return margin >> PAGE_SHIFT; 1539 return margin >> PAGE_SHIFT;
1540 } 1540 }
1541 1541
1542 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1542 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1543 { 1543 {
1544 /* root ? */ 1544 /* root ? */
1545 if (!css_parent(&memcg->css)) 1545 if (!css_parent(&memcg->css))
1546 return vm_swappiness; 1546 return vm_swappiness;
1547 1547
1548 return memcg->swappiness; 1548 return memcg->swappiness;
1549 } 1549 }
1550 1550
1551 /* 1551 /*
1552 * memcg->moving_account is used for checking possibility that some thread is 1552 * memcg->moving_account is used for checking possibility that some thread is
1553 * calling move_account(). When a thread on CPU-A starts moving pages under 1553 * calling move_account(). When a thread on CPU-A starts moving pages under
1554 * a memcg, other threads should check memcg->moving_account under 1554 * a memcg, other threads should check memcg->moving_account under
1555 * rcu_read_lock(), like this: 1555 * rcu_read_lock(), like this:
1556 * 1556 *
1557 * CPU-A CPU-B 1557 * CPU-A CPU-B
1558 * rcu_read_lock() 1558 * rcu_read_lock()
1559 * memcg->moving_account+1 if (memcg->moving_account) 1559 * memcg->moving_account+1 if (memcg->moving_account)
1560 * take heavy locks. 1560 * take heavy locks.
1561 * synchronize_rcu() update something. 1561 * synchronize_rcu() update something.
1562 * rcu_read_unlock() 1562 * rcu_read_unlock()
1563 * start move here. 1563 * start move here.
1564 */ 1564 */
1565 1565
1566 /* for quick checking without looking up memcg */ 1566 /* for quick checking without looking up memcg */
1567 atomic_t memcg_moving __read_mostly; 1567 atomic_t memcg_moving __read_mostly;
1568 1568
1569 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1569 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1570 { 1570 {
1571 atomic_inc(&memcg_moving); 1571 atomic_inc(&memcg_moving);
1572 atomic_inc(&memcg->moving_account); 1572 atomic_inc(&memcg->moving_account);
1573 synchronize_rcu(); 1573 synchronize_rcu();
1574 } 1574 }
1575 1575
1576 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1576 static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1577 { 1577 {
1578 /* 1578 /*
1579 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1579 * Now, mem_cgroup_clear_mc() may call this function with NULL.
1580 * We check NULL in callee rather than caller. 1580 * We check NULL in callee rather than caller.
1581 */ 1581 */
1582 if (memcg) { 1582 if (memcg) {
1583 atomic_dec(&memcg_moving); 1583 atomic_dec(&memcg_moving);
1584 atomic_dec(&memcg->moving_account); 1584 atomic_dec(&memcg->moving_account);
1585 } 1585 }
1586 } 1586 }
1587 1587
1588 /* 1588 /*
1589 * Two routines for checking whether "mem" is under move_account() or not. 1589 * Two routines for checking whether "mem" is under move_account() or not.
1590 * 1590 *
1591 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1591 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This
1592 * is used for avoiding races in accounting. If true, 1592 * is used for avoiding races in accounting. If true,
1593 * pc->mem_cgroup may be overwritten. 1593 * pc->mem_cgroup may be overwritten.
1594 * 1594 *
1595 * mem_cgroup_under_move() - checking whether a cgroup is mc.from, mc.to or 1595 * mem_cgroup_under_move() - checking whether a cgroup is mc.from, mc.to or
1596 * under the hierarchy of moving cgroups. This is for 1596 * under the hierarchy of moving cgroups. This is for
1597 * waiting at high memory pressure caused by "move". 1597 * waiting at high memory pressure caused by "move".
1598 */ 1598 */
1599 1599
1600 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1600 static bool mem_cgroup_stolen(struct mem_cgroup *memcg)
1601 { 1601 {
1602 VM_BUG_ON(!rcu_read_lock_held()); 1602 VM_BUG_ON(!rcu_read_lock_held());
1603 return atomic_read(&memcg->moving_account) > 0; 1603 return atomic_read(&memcg->moving_account) > 0;
1604 } 1604 }
1605 1605
1606 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1606 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1607 { 1607 {
1608 struct mem_cgroup *from; 1608 struct mem_cgroup *from;
1609 struct mem_cgroup *to; 1609 struct mem_cgroup *to;
1610 bool ret = false; 1610 bool ret = false;
1611 /* 1611 /*
1612 * Unlike task_move routines, we access mc.to, mc.from not under 1612 * Unlike task_move routines, we access mc.to, mc.from not under
1613 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1613 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1614 */ 1614 */
1615 spin_lock(&mc.lock); 1615 spin_lock(&mc.lock);
1616 from = mc.from; 1616 from = mc.from;
1617 to = mc.to; 1617 to = mc.to;
1618 if (!from) 1618 if (!from)
1619 goto unlock; 1619 goto unlock;
1620 1620
1621 ret = mem_cgroup_same_or_subtree(memcg, from) 1621 ret = mem_cgroup_same_or_subtree(memcg, from)
1622 || mem_cgroup_same_or_subtree(memcg, to); 1622 || mem_cgroup_same_or_subtree(memcg, to);
1623 unlock: 1623 unlock:
1624 spin_unlock(&mc.lock); 1624 spin_unlock(&mc.lock);
1625 return ret; 1625 return ret;
1626 } 1626 }
1627 1627
1628 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1628 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1629 { 1629 {
1630 if (mc.moving_task && current != mc.moving_task) { 1630 if (mc.moving_task && current != mc.moving_task) {
1631 if (mem_cgroup_under_move(memcg)) { 1631 if (mem_cgroup_under_move(memcg)) {
1632 DEFINE_WAIT(wait); 1632 DEFINE_WAIT(wait);
1633 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1633 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1634 /* moving charge context might have finished. */ 1634 /* moving charge context might have finished. */
1635 if (mc.moving_task) 1635 if (mc.moving_task)
1636 schedule(); 1636 schedule();
1637 finish_wait(&mc.waitq, &wait); 1637 finish_wait(&mc.waitq, &wait);
1638 return true; 1638 return true;
1639 } 1639 }
1640 } 1640 }
1641 return false; 1641 return false;
1642 } 1642 }
1643 1643
1644 /* 1644 /*
1645 * Take this lock when 1645 * Take this lock when
1646 * - code tries to modify a page's memcg while it's USED. 1646 * - code tries to modify a page's memcg while it's USED.
1647 * - code tries to modify page state accounting in a memcg. 1647 * - code tries to modify page state accounting in a memcg.
1648 * see mem_cgroup_stolen(), too. 1648 * see mem_cgroup_stolen(), too.
1649 */ 1649 */
1650 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1650 static void move_lock_mem_cgroup(struct mem_cgroup *memcg,
1651 unsigned long *flags) 1651 unsigned long *flags)
1652 { 1652 {
1653 spin_lock_irqsave(&memcg->move_lock, *flags); 1653 spin_lock_irqsave(&memcg->move_lock, *flags);
1654 } 1654 }
1655 1655
1656 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1656 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1657 unsigned long *flags) 1657 unsigned long *flags)
1658 { 1658 {
1659 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1659 spin_unlock_irqrestore(&memcg->move_lock, *flags);
1660 } 1660 }
1661 1661
1662 #define K(x) ((x) << (PAGE_SHIFT-10)) 1662 #define K(x) ((x) << (PAGE_SHIFT-10))
1663 /** 1663 /**
1664 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1664 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1665 * @memcg: The memory cgroup that went over limit 1665 * @memcg: The memory cgroup that went over limit
1666 * @p: Task that is going to be killed 1666 * @p: Task that is going to be killed
1667 * 1667 *
1668 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1668 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1669 * enabled 1669 * enabled
1670 */ 1670 */
1671 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1671 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1672 { 1672 {
1673 /* oom_info_lock ensures that parallel ooms do not interleave */ 1673 /* oom_info_lock ensures that parallel ooms do not interleave */
1674 static DEFINE_MUTEX(oom_info_lock); 1674 static DEFINE_MUTEX(oom_info_lock);
1675 struct mem_cgroup *iter; 1675 struct mem_cgroup *iter;
1676 unsigned int i; 1676 unsigned int i;
1677 1677
1678 if (!p) 1678 if (!p)
1679 return; 1679 return;
1680 1680
1681 mutex_lock(&oom_info_lock); 1681 mutex_lock(&oom_info_lock);
1682 rcu_read_lock(); 1682 rcu_read_lock();
1683 1683
1684 pr_info("Task in "); 1684 pr_info("Task in ");
1685 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); 1685 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1686 pr_info(" killed as a result of limit of "); 1686 pr_info(" killed as a result of limit of ");
1687 pr_cont_cgroup_path(memcg->css.cgroup); 1687 pr_cont_cgroup_path(memcg->css.cgroup);
1688 pr_info("\n"); 1688 pr_info("\n");
1689 1689
1690 rcu_read_unlock(); 1690 rcu_read_unlock();
1691 1691
1692 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1692 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
1693 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1693 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1694 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1694 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1695 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1695 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1696 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1696 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n",
1697 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1697 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1698 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1698 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1699 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1699 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1700 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1700 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n",
1701 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1701 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10,
1702 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1702 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10,
1703 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1703 res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
1704 1704
1705 for_each_mem_cgroup_tree(iter, memcg) { 1705 for_each_mem_cgroup_tree(iter, memcg) {
1706 pr_info("Memory cgroup stats for "); 1706 pr_info("Memory cgroup stats for ");
1707 pr_cont_cgroup_path(iter->css.cgroup); 1707 pr_cont_cgroup_path(iter->css.cgroup);
1708 pr_cont(":"); 1708 pr_cont(":");
1709 1709
1710 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1710 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1711 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1711 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1712 continue; 1712 continue;
1713 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1713 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
1714 K(mem_cgroup_read_stat(iter, i))); 1714 K(mem_cgroup_read_stat(iter, i)));
1715 } 1715 }
1716 1716
1717 for (i = 0; i < NR_LRU_LISTS; i++) 1717 for (i = 0; i < NR_LRU_LISTS; i++)
1718 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1718 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1719 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1719 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1720 1720
1721 pr_cont("\n"); 1721 pr_cont("\n");
1722 } 1722 }
1723 mutex_unlock(&oom_info_lock); 1723 mutex_unlock(&oom_info_lock);
1724 } 1724 }
1725 1725
1726 /* 1726 /*
1727 * This function returns the number of memcgs under the hierarchy tree. 1727 * This function returns the number of memcgs under the hierarchy tree.
1728 * Returns 1 (self count) if there are no children. 1728 * Returns 1 (self count) if there are no children.
1729 */ 1729 */
1730 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1730 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1731 { 1731 {
1732 int num = 0; 1732 int num = 0;
1733 struct mem_cgroup *iter; 1733 struct mem_cgroup *iter;
1734 1734
1735 for_each_mem_cgroup_tree(iter, memcg) 1735 for_each_mem_cgroup_tree(iter, memcg)
1736 num++; 1736 num++;
1737 return num; 1737 return num;
1738 } 1738 }
1739 1739
1740 /* 1740 /*
1741 * Return the memory (and swap, if configured) limit for a memcg. 1741 * Return the memory (and swap, if configured) limit for a memcg.
1742 */ 1742 */
1743 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1743 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1744 { 1744 {
1745 u64 limit; 1745 u64 limit;
1746 1746
1747 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1747 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1748 1748
1749 /* 1749 /*
1750 * Do not consider swap space if we cannot swap due to swappiness 1750 * Do not consider swap space if we cannot swap due to swappiness
1751 */ 1751 */
1752 if (mem_cgroup_swappiness(memcg)) { 1752 if (mem_cgroup_swappiness(memcg)) {
1753 u64 memsw; 1753 u64 memsw;
1754 1754
1755 limit += total_swap_pages << PAGE_SHIFT; 1755 limit += total_swap_pages << PAGE_SHIFT;
1756 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1756 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1757 1757
1758 /* 1758 /*
1759 * If memsw is finite and limits the amount of swap space 1759 * If memsw is finite and limits the amount of swap space
1760 * available to this memcg, return that limit. 1760 * available to this memcg, return that limit.
1761 */ 1761 */
1762 limit = min(limit, memsw); 1762 limit = min(limit, memsw);
1763 } 1763 }
1764 1764
1765 return limit; 1765 return limit;
1766 } 1766 }
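A hypothetical worked example of the calculation above: with a 512M memory limit, 1G of swap configured, non-zero swappiness and no memory+swap limit set (memsw effectively unlimited), the function returns 512M + 1G = 1.5G. If the memsw limit were set to 768M, the result would be min(1.5G, 768M) = 768M, and with swappiness 0 the swap term is skipped entirely, leaving the plain 512M memory limit.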
1767 1767
1768 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1768 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1769 int order) 1769 int order)
1770 { 1770 {
1771 struct mem_cgroup *iter; 1771 struct mem_cgroup *iter;
1772 unsigned long chosen_points = 0; 1772 unsigned long chosen_points = 0;
1773 unsigned long totalpages; 1773 unsigned long totalpages;
1774 unsigned int points = 0; 1774 unsigned int points = 0;
1775 struct task_struct *chosen = NULL; 1775 struct task_struct *chosen = NULL;
1776 1776
1777 /* 1777 /*
1778 * If current has a pending SIGKILL or is exiting, then automatically 1778 * If current has a pending SIGKILL or is exiting, then automatically
1779 * select it. The goal is to allow it to allocate so that it may 1779 * select it. The goal is to allow it to allocate so that it may
1780 * quickly exit and free its memory. 1780 * quickly exit and free its memory.
1781 */ 1781 */
1782 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1782 if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
1783 set_thread_flag(TIF_MEMDIE); 1783 set_thread_flag(TIF_MEMDIE);
1784 return; 1784 return;
1785 } 1785 }
1786 1786
1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1787 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1788 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1789 for_each_mem_cgroup_tree(iter, memcg) { 1789 for_each_mem_cgroup_tree(iter, memcg) {
1790 struct css_task_iter it; 1790 struct css_task_iter it;
1791 struct task_struct *task; 1791 struct task_struct *task;
1792 1792
1793 css_task_iter_start(&iter->css, &it); 1793 css_task_iter_start(&iter->css, &it);
1794 while ((task = css_task_iter_next(&it))) { 1794 while ((task = css_task_iter_next(&it))) {
1795 switch (oom_scan_process_thread(task, totalpages, NULL, 1795 switch (oom_scan_process_thread(task, totalpages, NULL,
1796 false)) { 1796 false)) {
1797 case OOM_SCAN_SELECT: 1797 case OOM_SCAN_SELECT:
1798 if (chosen) 1798 if (chosen)
1799 put_task_struct(chosen); 1799 put_task_struct(chosen);
1800 chosen = task; 1800 chosen = task;
1801 chosen_points = ULONG_MAX; 1801 chosen_points = ULONG_MAX;
1802 get_task_struct(chosen); 1802 get_task_struct(chosen);
1803 /* fall through */ 1803 /* fall through */
1804 case OOM_SCAN_CONTINUE: 1804 case OOM_SCAN_CONTINUE:
1805 continue; 1805 continue;
1806 case OOM_SCAN_ABORT: 1806 case OOM_SCAN_ABORT:
1807 css_task_iter_end(&it); 1807 css_task_iter_end(&it);
1808 mem_cgroup_iter_break(memcg, iter); 1808 mem_cgroup_iter_break(memcg, iter);
1809 if (chosen) 1809 if (chosen)
1810 put_task_struct(chosen); 1810 put_task_struct(chosen);
1811 return; 1811 return;
1812 case OOM_SCAN_OK: 1812 case OOM_SCAN_OK:
1813 break; 1813 break;
1814 }; 1814 };
1815 points = oom_badness(task, memcg, NULL, totalpages); 1815 points = oom_badness(task, memcg, NULL, totalpages);
1816 if (!points || points < chosen_points) 1816 if (!points || points < chosen_points)
1817 continue; 1817 continue;
1818 /* Prefer thread group leaders for display purposes */ 1818 /* Prefer thread group leaders for display purposes */
1819 if (points == chosen_points && 1819 if (points == chosen_points &&
1820 thread_group_leader(chosen)) 1820 thread_group_leader(chosen))
1821 continue; 1821 continue;
1822 1822
1823 if (chosen) 1823 if (chosen)
1824 put_task_struct(chosen); 1824 put_task_struct(chosen);
1825 chosen = task; 1825 chosen = task;
1826 chosen_points = points; 1826 chosen_points = points;
1827 get_task_struct(chosen); 1827 get_task_struct(chosen);
1828 } 1828 }
1829 css_task_iter_end(&it); 1829 css_task_iter_end(&it);
1830 } 1830 }
1831 1831
1832 if (!chosen) 1832 if (!chosen)
1833 return; 1833 return;
1834 points = chosen_points * 1000 / totalpages; 1834 points = chosen_points * 1000 / totalpages;
1835 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1835 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1836 NULL, "Memory cgroup out of memory"); 1836 NULL, "Memory cgroup out of memory");
1837 } 1837 }
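To make the final scaling concrete (numbers are hypothetical): for a memcg limited to 1G, totalpages is 262144 with 4KB pages; if the chosen victim's oom_badness() score is 131072, the reported points become 131072 * 1000 / 262144 = 500, i.e. badness is normalized to a 0..1000 range relative to the cgroup's limit before being passed to oom_kill_process().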
1838 1838
1839 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1839 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1840 gfp_t gfp_mask, 1840 gfp_t gfp_mask,
1841 unsigned long flags) 1841 unsigned long flags)
1842 { 1842 {
1843 unsigned long total = 0; 1843 unsigned long total = 0;
1844 bool noswap = false; 1844 bool noswap = false;
1845 int loop; 1845 int loop;
1846 1846
1847 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1847 if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
1848 noswap = true; 1848 noswap = true;
1849 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1849 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
1850 noswap = true; 1850 noswap = true;
1851 1851
1852 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1852 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
1853 if (loop) 1853 if (loop)
1854 drain_all_stock_async(memcg); 1854 drain_all_stock_async(memcg);
1855 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1855 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
1856 /* 1856 /*
1857 * Allow limit shrinkers, which are triggered directly 1857 * Allow limit shrinkers, which are triggered directly
1858 * by userspace, to catch signals and stop reclaim 1858 * by userspace, to catch signals and stop reclaim
1859 * after minimal progress, regardless of the margin. 1859 * after minimal progress, regardless of the margin.
1860 */ 1860 */
1861 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1861 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
1862 break; 1862 break;
1863 if (mem_cgroup_margin(memcg)) 1863 if (mem_cgroup_margin(memcg))
1864 break; 1864 break;
1865 /* 1865 /*
1866 * If nothing was reclaimed after two attempts, there 1866 * If nothing was reclaimed after two attempts, there
1867 * may be no reclaimable pages in this hierarchy. 1867 * may be no reclaimable pages in this hierarchy.
1868 */ 1868 */
1869 if (loop && !total) 1869 if (loop && !total)
1870 break; 1870 break;
1871 } 1871 }
1872 return total; 1872 return total;
1873 } 1873 }
1874 1874
1875 /** 1875 /**
1876 * test_mem_cgroup_node_reclaimable 1876 * test_mem_cgroup_node_reclaimable
1877 * @memcg: the target memcg 1877 * @memcg: the target memcg
1878 * @nid: the node ID to be checked. 1878 * @nid: the node ID to be checked.
1879 * @noswap : specify true here if the user wants file-only information. 1879 * @noswap : specify true here if the user wants file-only information.
1880 * 1880 *
1881 * This function returns whether the specified memcg contains any 1881 * This function returns whether the specified memcg contains any
1882 * reclaimable pages on a node. Returns true if there are any reclaimable 1882 * reclaimable pages on a node. Returns true if there are any reclaimable
1883 * pages in the node. 1883 * pages in the node.
1884 */ 1884 */
1885 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, 1885 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1886 int nid, bool noswap) 1886 int nid, bool noswap)
1887 { 1887 {
1888 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) 1888 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1889 return true; 1889 return true;
1890 if (noswap || !total_swap_pages) 1890 if (noswap || !total_swap_pages)
1891 return false; 1891 return false;
1892 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) 1892 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1893 return true; 1893 return true;
1894 return false; 1894 return false;
1895 1895
1896 } 1896 }
1897 #if MAX_NUMNODES > 1 1897 #if MAX_NUMNODES > 1
1898 1898
1899 /* 1899 /*
1900 * Always updating the nodemask is not very good - even if we have an empty 1900 * Always updating the nodemask is not very good - even if we have an empty
1901 * list or the wrong list here, we can start from some node and traverse all 1901 * list or the wrong list here, we can start from some node and traverse all
1902 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1902 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1903 * 1903 *
1904 */ 1904 */
1905 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) 1905 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1906 { 1906 {
1907 int nid; 1907 int nid;
1908 /* 1908 /*
1909 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1909 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1910 * pagein/pageout changes since the last update. 1910 * pagein/pageout changes since the last update.
1911 */ 1911 */
1912 if (!atomic_read(&memcg->numainfo_events)) 1912 if (!atomic_read(&memcg->numainfo_events))
1913 return; 1913 return;
1914 if (atomic_inc_return(&memcg->numainfo_updating) > 1) 1914 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1915 return; 1915 return;
1916 1916
1917 /* make a nodemask where this memcg uses memory from */ 1917 /* make a nodemask where this memcg uses memory from */
1918 memcg->scan_nodes = node_states[N_MEMORY]; 1918 memcg->scan_nodes = node_states[N_MEMORY];
1919 1919
1920 for_each_node_mask(nid, node_states[N_MEMORY]) { 1920 for_each_node_mask(nid, node_states[N_MEMORY]) {
1921 1921
1922 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) 1922 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1923 node_clear(nid, memcg->scan_nodes); 1923 node_clear(nid, memcg->scan_nodes);
1924 } 1924 }
1925 1925
1926 atomic_set(&memcg->numainfo_events, 0); 1926 atomic_set(&memcg->numainfo_events, 0);
1927 atomic_set(&memcg->numainfo_updating, 0); 1927 atomic_set(&memcg->numainfo_updating, 0);
1928 } 1928 }
1929 1929
1930 /* 1930 /*
1931 * Selecting a node where we start reclaim from. Because what we need is just 1931 * Selecting a node where we start reclaim from. Because what we need is just
1932 * reducing the usage counter, starting from anywhere is OK. Considering 1932 * reducing the usage counter, starting from anywhere is OK. Considering
1933 * memory reclaim from the current node, there are pros and cons. 1933 * memory reclaim from the current node, there are pros and cons.
1934 * 1934 *
1935 * Freeing memory from the current node means freeing memory from a node which 1935 * Freeing memory from the current node means freeing memory from a node which
1936 * we'll use or we've used. So, it may make the LRU bad. And if several threads 1936 * we'll use or we've used. So, it may make the LRU bad. And if several threads
1937 * hit limits, they will see contention on a node. But freeing from a remote 1937 * hit limits, they will see contention on a node. But freeing from a remote
1938 * node means higher costs for memory reclaim because of memory latency. 1938 * node means higher costs for memory reclaim because of memory latency.
1939 * 1939 *
1940 * For now, we use round-robin. A better algorithm is welcome. 1940 * For now, we use round-robin. A better algorithm is welcome.
1941 */ 1941 */
1942 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 1942 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1943 { 1943 {
1944 int node; 1944 int node;
1945 1945
1946 mem_cgroup_may_update_nodemask(memcg); 1946 mem_cgroup_may_update_nodemask(memcg);
1947 node = memcg->last_scanned_node; 1947 node = memcg->last_scanned_node;
1948 1948
1949 node = next_node(node, memcg->scan_nodes); 1949 node = next_node(node, memcg->scan_nodes);
1950 if (node == MAX_NUMNODES) 1950 if (node == MAX_NUMNODES)
1951 node = first_node(memcg->scan_nodes); 1951 node = first_node(memcg->scan_nodes);
1952 /* 1952 /*
1953 * We call this when we hit the limit, not when pages are added to the LRU. 1953 * We call this when we hit the limit, not when pages are added to the LRU.
1954 * The LRUs may hold no pages because all pages are UNEVICTABLE, or the 1954 * The LRUs may hold no pages because all pages are UNEVICTABLE, or the
1955 * memcg is too small and no pages are on the LRU. In that case, 1955 * memcg is too small and no pages are on the LRU. In that case,
1956 * we use the current node. 1956 * we use the current node.
1957 */ 1957 */
1958 if (unlikely(node == MAX_NUMNODES)) 1958 if (unlikely(node == MAX_NUMNODES))
1959 node = numa_node_id(); 1959 node = numa_node_id();
1960 1960
1961 memcg->last_scanned_node = node; 1961 memcg->last_scanned_node = node;
1962 return node; 1962 return node;
1963 } 1963 }
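A quick hypothetical walk-through of the round-robin above: with scan_nodes = {0, 2} and last_scanned_node = 0, next_node() yields node 2; on the next call next_node(2, ...) returns MAX_NUMNODES, so first_node() wraps back to node 0. If scan_nodes happens to be empty (for example, every node failed the reclaimable test), both lookups return MAX_NUMNODES and the fallback picks numa_node_id(), the current node.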
1964 1964
1965 /* 1965 /*
1966 * Check whether any node contains reclaimable pages or not. 1966 * Check whether any node contains reclaimable pages or not.
1967 * For a quick scan, we make use of scan_nodes. This allows us to skip 1967 * For a quick scan, we make use of scan_nodes. This allows us to skip
1968 * unused nodes. But scan_nodes is lazily updated and may not contain 1968 * unused nodes. But scan_nodes is lazily updated and may not contain
1969 * enough new information. We need to double check. 1969 * enough new information. We need to double check.
1970 */ 1970 */
1971 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1971 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1972 { 1972 {
1973 int nid; 1973 int nid;
1974 1974
1975 /* 1975 /*
1976 * quick check... making use of scan_nodes. 1976 * quick check... making use of scan_nodes.
1977 * We can skip unused nodes. 1977 * We can skip unused nodes.
1978 */ 1978 */
1979 if (!nodes_empty(memcg->scan_nodes)) { 1979 if (!nodes_empty(memcg->scan_nodes)) {
1980 for (nid = first_node(memcg->scan_nodes); 1980 for (nid = first_node(memcg->scan_nodes);
1981 nid < MAX_NUMNODES; 1981 nid < MAX_NUMNODES;
1982 nid = next_node(nid, memcg->scan_nodes)) { 1982 nid = next_node(nid, memcg->scan_nodes)) {
1983 1983
1984 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1984 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1985 return true; 1985 return true;
1986 } 1986 }
1987 } 1987 }
1988 /* 1988 /*
1989 * Check rest of nodes. 1989 * Check rest of nodes.
1990 */ 1990 */
1991 for_each_node_state(nid, N_MEMORY) { 1991 for_each_node_state(nid, N_MEMORY) {
1992 if (node_isset(nid, memcg->scan_nodes)) 1992 if (node_isset(nid, memcg->scan_nodes))
1993 continue; 1993 continue;
1994 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 1994 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1995 return true; 1995 return true;
1996 } 1996 }
1997 return false; 1997 return false;
1998 } 1998 }
1999 1999
2000 #else 2000 #else
2001 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2001 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
2002 { 2002 {
2003 return 0; 2003 return 0;
2004 } 2004 }
2005 2005
2006 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2006 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
2007 { 2007 {
2008 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2008 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
2009 } 2009 }
2010 #endif 2010 #endif
2011 2011
2012 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2012 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
2013 struct zone *zone, 2013 struct zone *zone,
2014 gfp_t gfp_mask, 2014 gfp_t gfp_mask,
2015 unsigned long *total_scanned) 2015 unsigned long *total_scanned)
2016 { 2016 {
2017 struct mem_cgroup *victim = NULL; 2017 struct mem_cgroup *victim = NULL;
2018 int total = 0; 2018 int total = 0;
2019 int loop = 0; 2019 int loop = 0;
2020 unsigned long excess; 2020 unsigned long excess;
2021 unsigned long nr_scanned; 2021 unsigned long nr_scanned;
2022 struct mem_cgroup_reclaim_cookie reclaim = { 2022 struct mem_cgroup_reclaim_cookie reclaim = {
2023 .zone = zone, 2023 .zone = zone,
2024 .priority = 0, 2024 .priority = 0,
2025 }; 2025 };
2026 2026
2027 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2027 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
2028 2028
2029 while (1) { 2029 while (1) {
2030 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2030 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
2031 if (!victim) { 2031 if (!victim) {
2032 loop++; 2032 loop++;
2033 if (loop >= 2) { 2033 if (loop >= 2) {
2034 /* 2034 /*
2035 * If we have not been able to reclaim 2035 * If we have not been able to reclaim
2036 * anything, it might be because there are 2036 * anything, it might be because there are
2037 * no reclaimable pages under this hierarchy 2037 * no reclaimable pages under this hierarchy
2038 */ 2038 */
2039 if (!total) 2039 if (!total)
2040 break; 2040 break;
2041 /* 2041 /*
2042 * We want to do more targeted reclaim. 2042 * We want to do more targeted reclaim.
2043 * excess >> 2 is not too excessive, so we do not 2043 * excess >> 2 is not too excessive, so we do not
2044 * reclaim too much, nor too little, so we do not keep 2044 * reclaim too much, nor too little, so we do not keep
2045 * coming back to reclaim from this cgroup 2045 * coming back to reclaim from this cgroup
2046 */ 2046 */
2047 if (total >= (excess >> 2) || 2047 if (total >= (excess >> 2) ||
2048 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2048 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
2049 break; 2049 break;
2050 } 2050 }
2051 continue; 2051 continue;
2052 } 2052 }
2053 if (!mem_cgroup_reclaimable(victim, false)) 2053 if (!mem_cgroup_reclaimable(victim, false))
2054 continue; 2054 continue;
2055 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2055 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
2056 zone, &nr_scanned); 2056 zone, &nr_scanned);
2057 *total_scanned += nr_scanned; 2057 *total_scanned += nr_scanned;
2058 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2058 if (!res_counter_soft_limit_excess(&root_memcg->res))
2059 break; 2059 break;
2060 } 2060 }
2061 mem_cgroup_iter_break(root_memcg, victim); 2061 mem_cgroup_iter_break(root_memcg, victim);
2062 return total; 2062 return total;
2063 } 2063 }
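As a worked hypothetical example of the cut-off above: if the cgroup is 40000 pages over its soft limit, excess is 40000 and reclaim continues until at least excess >> 2 = 10000 pages have been freed (checked after each full pass over the hierarchy, starting with the second), until res_counter_soft_limit_excess() drops to zero, or until the pass count exceeds MEM_CGROUP_MAX_RECLAIM_LOOPS; if two full passes reclaim nothing at all, the loop gives up early.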
2064 2064
2065 #ifdef CONFIG_LOCKDEP 2065 #ifdef CONFIG_LOCKDEP
2066 static struct lockdep_map memcg_oom_lock_dep_map = { 2066 static struct lockdep_map memcg_oom_lock_dep_map = {
2067 .name = "memcg_oom_lock", 2067 .name = "memcg_oom_lock",
2068 }; 2068 };
2069 #endif 2069 #endif
2070 2070
2071 static DEFINE_SPINLOCK(memcg_oom_lock); 2071 static DEFINE_SPINLOCK(memcg_oom_lock);
2072 2072
2073 /* 2073 /*
2074 * Check whether the OOM killer is already running under our hierarchy. 2074 * Check whether the OOM killer is already running under our hierarchy.
2075 * If someone is running it, return false. 2075 * If someone is running it, return false.
2076 */ 2076 */
2077 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2077 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
2078 { 2078 {
2079 struct mem_cgroup *iter, *failed = NULL; 2079 struct mem_cgroup *iter, *failed = NULL;
2080 2080
2081 spin_lock(&memcg_oom_lock); 2081 spin_lock(&memcg_oom_lock);
2082 2082
2083 for_each_mem_cgroup_tree(iter, memcg) { 2083 for_each_mem_cgroup_tree(iter, memcg) {
2084 if (iter->oom_lock) { 2084 if (iter->oom_lock) {
2085 /* 2085 /*
2086 * this subtree of our hierarchy is already locked, 2086 * this subtree of our hierarchy is already locked,
2087 * so we cannot grant the lock. 2087 * so we cannot grant the lock.
2088 */ 2088 */
2089 failed = iter; 2089 failed = iter;
2090 mem_cgroup_iter_break(memcg, iter); 2090 mem_cgroup_iter_break(memcg, iter);
2091 break; 2091 break;
2092 } else 2092 } else
2093 iter->oom_lock = true; 2093 iter->oom_lock = true;
2094 } 2094 }
2095 2095
2096 if (failed) { 2096 if (failed) {
2097 /* 2097 /*
2098 * OK, we failed to lock the whole subtree so we have 2098 * OK, we failed to lock the whole subtree so we have
2099 * to clean up what we set up, up to the failing subtree 2099 * to clean up what we set up, up to the failing subtree
2100 */ 2100 */
2101 for_each_mem_cgroup_tree(iter, memcg) { 2101 for_each_mem_cgroup_tree(iter, memcg) {
2102 if (iter == failed) { 2102 if (iter == failed) {
2103 mem_cgroup_iter_break(memcg, iter); 2103 mem_cgroup_iter_break(memcg, iter);
2104 break; 2104 break;
2105 } 2105 }
2106 iter->oom_lock = false; 2106 iter->oom_lock = false;
2107 } 2107 }
2108 } else 2108 } else
2109 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2109 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
2110 2110
2111 spin_unlock(&memcg_oom_lock); 2111 spin_unlock(&memcg_oom_lock);
2112 2112
2113 return !failed; 2113 return !failed;
2114 } 2114 }
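To illustrate the rollback above with a hypothetical hierarchy A -> B -> C where C already holds oom_lock: the first walk sets oom_lock on A and B, hits C, records it as the failed memcg and breaks out; the cleanup walk then clears oom_lock on A and B, stops when it reaches C again, and the function returns false without touching C's existing lock.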
2115 2115
2116 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2116 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
2117 { 2117 {
2118 struct mem_cgroup *iter; 2118 struct mem_cgroup *iter;
2119 2119
2120 spin_lock(&memcg_oom_lock); 2120 spin_lock(&memcg_oom_lock);
2121 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2121 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
2122 for_each_mem_cgroup_tree(iter, memcg) 2122 for_each_mem_cgroup_tree(iter, memcg)
2123 iter->oom_lock = false; 2123 iter->oom_lock = false;
2124 spin_unlock(&memcg_oom_lock); 2124 spin_unlock(&memcg_oom_lock);
2125 } 2125 }
2126 2126
2127 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2127 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
2128 { 2128 {
2129 struct mem_cgroup *iter; 2129 struct mem_cgroup *iter;
2130 2130
2131 for_each_mem_cgroup_tree(iter, memcg) 2131 for_each_mem_cgroup_tree(iter, memcg)
2132 atomic_inc(&iter->under_oom); 2132 atomic_inc(&iter->under_oom);
2133 } 2133 }
2134 2134
2135 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2135 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
2136 { 2136 {
2137 struct mem_cgroup *iter; 2137 struct mem_cgroup *iter;
2138 2138
2139 /* 2139 /*
2140 * When a new child is created while the hierarchy is under oom, 2140 * When a new child is created while the hierarchy is under oom,
2141 * mem_cgroup_oom_lock() may not be called. We have to use 2141 * mem_cgroup_oom_lock() may not be called. We have to use
2142 * atomic_add_unless() here. 2142 * atomic_add_unless() here.
2143 */ 2143 */
2144 for_each_mem_cgroup_tree(iter, memcg) 2144 for_each_mem_cgroup_tree(iter, memcg)
2145 atomic_add_unless(&iter->under_oom, -1, 0); 2145 atomic_add_unless(&iter->under_oom, -1, 0);
2146 } 2146 }
2147 2147
2148 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2148 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
2149 2149
2150 struct oom_wait_info { 2150 struct oom_wait_info {
2151 struct mem_cgroup *memcg; 2151 struct mem_cgroup *memcg;
2152 wait_queue_t wait; 2152 wait_queue_t wait;
2153 }; 2153 };
2154 2154
2155 static int memcg_oom_wake_function(wait_queue_t *wait, 2155 static int memcg_oom_wake_function(wait_queue_t *wait,
2156 unsigned mode, int sync, void *arg) 2156 unsigned mode, int sync, void *arg)
2157 { 2157 {
2158 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2158 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
2159 struct mem_cgroup *oom_wait_memcg; 2159 struct mem_cgroup *oom_wait_memcg;
2160 struct oom_wait_info *oom_wait_info; 2160 struct oom_wait_info *oom_wait_info;
2161 2161
2162 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2162 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
2163 oom_wait_memcg = oom_wait_info->memcg; 2163 oom_wait_memcg = oom_wait_info->memcg;
2164 2164
2165 /* 2165 /*
2166 * Both oom_wait_info->memcg and wake_memcg are stable under us, 2166 * Both oom_wait_info->memcg and wake_memcg are stable under us,
2167 * so we can use css_is_ancestor() without worrying about RCU. 2167 * so we can use css_is_ancestor() without worrying about RCU.
2168 */ 2168 */
2169 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2169 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
2170 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2170 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
2171 return 0; 2171 return 0;
2172 return autoremove_wake_function(wait, mode, sync, arg); 2172 return autoremove_wake_function(wait, mode, sync, arg);
2173 } 2173 }
2174 2174
2175 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2175 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
2176 { 2176 {
2177 atomic_inc(&memcg->oom_wakeups); 2177 atomic_inc(&memcg->oom_wakeups);
2178 /* for filtering, pass "memcg" as argument. */ 2178 /* for filtering, pass "memcg" as argument. */
2179 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2179 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
2180 } 2180 }
2181 2181
2182 static void memcg_oom_recover(struct mem_cgroup *memcg) 2182 static void memcg_oom_recover(struct mem_cgroup *memcg)
2183 { 2183 {
2184 if (memcg && atomic_read(&memcg->under_oom)) 2184 if (memcg && atomic_read(&memcg->under_oom))
2185 memcg_wakeup_oom(memcg); 2185 memcg_wakeup_oom(memcg);
2186 } 2186 }
2187 2187
2188 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2188 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2189 { 2189 {
2190 if (!current->memcg_oom.may_oom) 2190 if (!current->memcg_oom.may_oom)
2191 return; 2191 return;
2192 /* 2192 /*
2193 * We are in the middle of the charge context here, so we 2193 * We are in the middle of the charge context here, so we
2194 * don't want to block when potentially sitting on a callstack 2194 * don't want to block when potentially sitting on a callstack
2195 * that holds all kinds of filesystem and mm locks. 2195 * that holds all kinds of filesystem and mm locks.
2196 * 2196 *
2197 * Also, the caller may handle a failed allocation gracefully 2197 * Also, the caller may handle a failed allocation gracefully
2198 * (like optional page cache readahead) and so an OOM killer 2198 * (like optional page cache readahead) and so an OOM killer
2199 * invocation might not even be necessary. 2199 * invocation might not even be necessary.
2200 * 2200 *
2201 * That's why we don't do anything here except remember the 2201 * That's why we don't do anything here except remember the
2202 * OOM context and then deal with it at the end of the page 2202 * OOM context and then deal with it at the end of the page
2203 * fault when the stack is unwound, the locks are released, 2203 * fault when the stack is unwound, the locks are released,
2204 * and when we know whether the fault was overall successful. 2204 * and when we know whether the fault was overall successful.
2205 */ 2205 */
2206 css_get(&memcg->css); 2206 css_get(&memcg->css);
2207 current->memcg_oom.memcg = memcg; 2207 current->memcg_oom.memcg = memcg;
2208 current->memcg_oom.gfp_mask = mask; 2208 current->memcg_oom.gfp_mask = mask;
2209 current->memcg_oom.order = order; 2209 current->memcg_oom.order = order;
2210 } 2210 }
2211 2211
2212 /** 2212 /**
2213 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2213 * mem_cgroup_oom_synchronize - complete memcg OOM handling
2214 * @handle: actually kill/wait or just clean up the OOM state 2214 * @handle: actually kill/wait or just clean up the OOM state
2215 * 2215 *
2216 * This has to be called at the end of a page fault if the memcg OOM 2216 * This has to be called at the end of a page fault if the memcg OOM
2217 * handler was enabled. 2217 * handler was enabled.
2218 * 2218 *
2219 * Memcg supports userspace OOM handling where failed allocations must 2219 * Memcg supports userspace OOM handling where failed allocations must
2220 * sleep on a waitqueue until the userspace task resolves the 2220 * sleep on a waitqueue until the userspace task resolves the
2221 * situation. Sleeping directly in the charge context with all kinds 2221 * situation. Sleeping directly in the charge context with all kinds
2222 * of locks held is not a good idea, instead we remember an OOM state 2222 * of locks held is not a good idea, instead we remember an OOM state
2223 * in the task and mem_cgroup_oom_synchronize() has to be called at 2223 * in the task and mem_cgroup_oom_synchronize() has to be called at
2224 * the end of the page fault to complete the OOM handling. 2224 * the end of the page fault to complete the OOM handling.
2225 * 2225 *
2226 * Returns %true if an ongoing memcg OOM situation was detected and 2226 * Returns %true if an ongoing memcg OOM situation was detected and
2227 * completed, %false otherwise. 2227 * completed, %false otherwise.
2228 */ 2228 */
2229 bool mem_cgroup_oom_synchronize(bool handle) 2229 bool mem_cgroup_oom_synchronize(bool handle)
2230 { 2230 {
2231 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2231 struct mem_cgroup *memcg = current->memcg_oom.memcg;
2232 struct oom_wait_info owait; 2232 struct oom_wait_info owait;
2233 bool locked; 2233 bool locked;
2234 2234
2235 /* OOM is global, do not handle */ 2235 /* OOM is global, do not handle */
2236 if (!memcg) 2236 if (!memcg)
2237 return false; 2237 return false;
2238 2238
2239 if (!handle) 2239 if (!handle)
2240 goto cleanup; 2240 goto cleanup;
2241 2241
2242 owait.memcg = memcg; 2242 owait.memcg = memcg;
2243 owait.wait.flags = 0; 2243 owait.wait.flags = 0;
2244 owait.wait.func = memcg_oom_wake_function; 2244 owait.wait.func = memcg_oom_wake_function;
2245 owait.wait.private = current; 2245 owait.wait.private = current;
2246 INIT_LIST_HEAD(&owait.wait.task_list); 2246 INIT_LIST_HEAD(&owait.wait.task_list);
2247 2247
2248 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2248 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
2249 mem_cgroup_mark_under_oom(memcg); 2249 mem_cgroup_mark_under_oom(memcg);
2250 2250
2251 locked = mem_cgroup_oom_trylock(memcg); 2251 locked = mem_cgroup_oom_trylock(memcg);
2252 2252
2253 if (locked) 2253 if (locked)
2254 mem_cgroup_oom_notify(memcg); 2254 mem_cgroup_oom_notify(memcg);
2255 2255
2256 if (locked && !memcg->oom_kill_disable) { 2256 if (locked && !memcg->oom_kill_disable) {
2257 mem_cgroup_unmark_under_oom(memcg); 2257 mem_cgroup_unmark_under_oom(memcg);
2258 finish_wait(&memcg_oom_waitq, &owait.wait); 2258 finish_wait(&memcg_oom_waitq, &owait.wait);
2259 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2259 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
2260 current->memcg_oom.order); 2260 current->memcg_oom.order);
2261 } else { 2261 } else {
2262 schedule(); 2262 schedule();
2263 mem_cgroup_unmark_under_oom(memcg); 2263 mem_cgroup_unmark_under_oom(memcg);
2264 finish_wait(&memcg_oom_waitq, &owait.wait); 2264 finish_wait(&memcg_oom_waitq, &owait.wait);
2265 } 2265 }
2266 2266
2267 if (locked) { 2267 if (locked) {
2268 mem_cgroup_oom_unlock(memcg); 2268 mem_cgroup_oom_unlock(memcg);
2269 /* 2269 /*
2270 * There is no guarantee that an OOM-lock contender 2270 * There is no guarantee that an OOM-lock contender
2271 * sees the wakeups triggered by the OOM kill 2271 * sees the wakeups triggered by the OOM kill
2272 * uncharges. Wake any sleepers explicitly. 2272 * uncharges. Wake any sleepers explicitly.
2273 */ 2273 */
2274 memcg_oom_recover(memcg); 2274 memcg_oom_recover(memcg);
2275 } 2275 }
2276 cleanup: 2276 cleanup:
2277 current->memcg_oom.memcg = NULL; 2277 current->memcg_oom.memcg = NULL;
2278 css_put(&memcg->css); 2278 css_put(&memcg->css);
2279 return true; 2279 return true;
2280 } 2280 }
2281 2281
2282 /* 2282 /*
2283 * Currently used to update mapped file statistics, but the routine can be 2283 * Currently used to update mapped file statistics, but the routine can be
2284 * generalized to update other statistics as well. 2284 * generalized to update other statistics as well.
2285 * 2285 *
2286 * Notes: Race condition 2286 * Notes: Race condition
2287 * 2287 *
2288 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2288 * We usually use page_cgroup_lock() for accessing page_cgroup member but
2289 * it tends to be costly. But considering some conditions, we don't need 2289 * it tends to be costly. But considering some conditions, we don't need
2290 * to do so _always_. 2290 * to do so _always_.
2291 * 2291 *
2292 * Considering "charge", lock_page_cgroup() is not required because all 2292 * Considering "charge", lock_page_cgroup() is not required because all
2293 * file-stat operations happen after a page is attached to radix-tree. There 2293 * file-stat operations happen after a page is attached to radix-tree. There
2294 * is no race with "charge". 2294 * is no race with "charge".
2295 * 2295 *
2296 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2296 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2297 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 2297 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
2298 * if there is a race with "uncharge". The statistics themselves are properly 2298 * if there is a race with "uncharge". The statistics themselves are properly
2299 * handled by flags. 2299 * handled by flags.
2300 * 2300 *
2301 * Considering "move", this is an only case we see a race. To make the race 2301 * Considering "move", this is an only case we see a race. To make the race
2302 * small, we check mm->moving_account and detect there are possibility of race 2302 * small, we check mm->moving_account and detect there are possibility of race
2303 * If there is, we take a lock. 2303 * If there is, we take a lock.
2304 */ 2304 */
2305 2305
2306 void __mem_cgroup_begin_update_page_stat(struct page *page, 2306 void __mem_cgroup_begin_update_page_stat(struct page *page,
2307 bool *locked, unsigned long *flags) 2307 bool *locked, unsigned long *flags)
2308 { 2308 {
2309 struct mem_cgroup *memcg; 2309 struct mem_cgroup *memcg;
2310 struct page_cgroup *pc; 2310 struct page_cgroup *pc;
2311 2311
2312 pc = lookup_page_cgroup(page); 2312 pc = lookup_page_cgroup(page);
2313 again: 2313 again:
2314 memcg = pc->mem_cgroup; 2314 memcg = pc->mem_cgroup;
2315 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2315 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2316 return; 2316 return;
2317 /* 2317 /*
2318 * If this memory cgroup is not under account moving, we don't 2318 * If this memory cgroup is not under account moving, we don't
2319 * need to take move_lock_mem_cgroup(). Because we already hold 2319 * need to take move_lock_mem_cgroup(). Because we already hold
2320 * rcu_read_lock(), any calls to move_account will be delayed until 2320 * rcu_read_lock(), any calls to move_account will be delayed until
2321 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2321 * rcu_read_unlock() if mem_cgroup_stolen() == true.
2322 */ 2322 */
2323 if (!mem_cgroup_stolen(memcg)) 2323 if (!mem_cgroup_stolen(memcg))
2324 return; 2324 return;
2325 2325
2326 move_lock_mem_cgroup(memcg, flags); 2326 move_lock_mem_cgroup(memcg, flags);
2327 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2327 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
2328 move_unlock_mem_cgroup(memcg, flags); 2328 move_unlock_mem_cgroup(memcg, flags);
2329 goto again; 2329 goto again;
2330 } 2330 }
2331 *locked = true; 2331 *locked = true;
2332 } 2332 }
2333 2333
2334 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2334 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
2335 { 2335 {
2336 struct page_cgroup *pc = lookup_page_cgroup(page); 2336 struct page_cgroup *pc = lookup_page_cgroup(page);
2337 2337
2338 /* 2338 /*
2339 * It's guaranteed that pc->mem_cgroup never changes while 2339 * It's guaranteed that pc->mem_cgroup never changes while
2340 * the lock is held, because any routine that modifies pc->mem_cgroup 2340 * the lock is held, because any routine that modifies pc->mem_cgroup
2341 * should take move_lock_mem_cgroup(). 2341 * should take move_lock_mem_cgroup().
2342 */ 2342 */
2343 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2343 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
2344 } 2344 }
2345 2345
2346 void mem_cgroup_update_page_stat(struct page *page, 2346 void mem_cgroup_update_page_stat(struct page *page,
2347 enum mem_cgroup_stat_index idx, int val) 2347 enum mem_cgroup_stat_index idx, int val)
2348 { 2348 {
2349 struct mem_cgroup *memcg; 2349 struct mem_cgroup *memcg;
2350 struct page_cgroup *pc = lookup_page_cgroup(page); 2350 struct page_cgroup *pc = lookup_page_cgroup(page);
2351 unsigned long uninitialized_var(flags); 2351 unsigned long uninitialized_var(flags);
2352 2352
2353 if (mem_cgroup_disabled()) 2353 if (mem_cgroup_disabled())
2354 return; 2354 return;
2355 2355
2356 VM_BUG_ON(!rcu_read_lock_held()); 2356 VM_BUG_ON(!rcu_read_lock_held());
2357 memcg = pc->mem_cgroup; 2357 memcg = pc->mem_cgroup;
2358 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2358 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2359 return; 2359 return;
2360 2360
2361 this_cpu_add(memcg->stat->count[idx], val); 2361 this_cpu_add(memcg->stat->count[idx], val);
2362 } 2362 }
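
The three helpers above implement a two-phase page-stat update: __mem_cgroup_begin_update_page_stat() takes the heavyweight move_lock only when an account move might be in flight, the statistic update itself is a plain per-cpu add, and __mem_cgroup_end_update_page_stat() drops the lock if it was taken. The following is a minimal userspace sketch of that protocol, not kernel code; the structure and every name in it are invented purely for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct group_stats {
        atomic_int moving_account;      /* stand-in for mem_cgroup_stolen() */
        pthread_mutex_t move_lock;      /* stand-in for move_lock_mem_cgroup() */
        atomic_long mapped_file;        /* one per-stat counter */
};

static void begin_update(struct group_stats *g, int *locked)
{
        *locked = 0;
        if (!atomic_load(&g->moving_account))
                return;                 /* fast path: nobody is moving, no lock */
        pthread_mutex_lock(&g->move_lock);
        *locked = 1;
}

static void update_stat(struct group_stats *g, long val)
{
        atomic_fetch_add(&g->mapped_file, val);
}

static void end_update(struct group_stats *g, int locked)
{
        if (locked)
                pthread_mutex_unlock(&g->move_lock);
}

int main(void)
{
        struct group_stats g = { .move_lock = PTHREAD_MUTEX_INITIALIZER };
        int locked;

        begin_update(&g, &locked);
        update_stat(&g, 1);
        end_update(&g, locked);
        printf("mapped_file = %ld\n", atomic_load(&g.mapped_file));
        return 0;
}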
2363 2363
2364 /* 2364 /*
2365 * size of first charge trial. "32" comes from vmscan.c's magic value. 2365 * size of first charge trial. "32" comes from vmscan.c's magic value.
2366 * TODO: bigger values may be necessary on big-iron machines. 2366 * TODO: bigger values may be necessary on big-iron machines.
2367 */ 2367 */
2368 #define CHARGE_BATCH 32U 2368 #define CHARGE_BATCH 32U
2369 struct memcg_stock_pcp { 2369 struct memcg_stock_pcp {
2370 struct mem_cgroup *cached; /* this is never the root cgroup */ 2370 struct mem_cgroup *cached; /* this is never the root cgroup */
2371 unsigned int nr_pages; 2371 unsigned int nr_pages;
2372 struct work_struct work; 2372 struct work_struct work;
2373 unsigned long flags; 2373 unsigned long flags;
2374 #define FLUSHING_CACHED_CHARGE 0 2374 #define FLUSHING_CACHED_CHARGE 0
2375 }; 2375 };
2376 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2376 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2377 static DEFINE_MUTEX(percpu_charge_mutex); 2377 static DEFINE_MUTEX(percpu_charge_mutex);
2378 2378
2379 /** 2379 /**
2380 * consume_stock: Try to consume stocked charge on this cpu. 2380 * consume_stock: Try to consume stocked charge on this cpu.
2381 * @memcg: memcg to consume from. 2381 * @memcg: memcg to consume from.
2382 * @nr_pages: how many pages to charge. 2382 * @nr_pages: how many pages to charge.
2383 * 2383 *
2384 * The charges will only happen if @memcg matches the current cpu's memcg 2384 * The charges will only happen if @memcg matches the current cpu's memcg
2385 * stock, and at least @nr_pages are available in that stock. Failure to 2385 * stock, and at least @nr_pages are available in that stock. Failure to
2386 * service an allocation will refill the stock. 2386 * service an allocation will refill the stock.
2387 * 2387 *
2388 * returns true if successful, false otherwise. 2388 * returns true if successful, false otherwise.
2389 */ 2389 */
2390 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2390 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2391 { 2391 {
2392 struct memcg_stock_pcp *stock; 2392 struct memcg_stock_pcp *stock;
2393 bool ret = true; 2393 bool ret = true;
2394 2394
2395 if (nr_pages > CHARGE_BATCH) 2395 if (nr_pages > CHARGE_BATCH)
2396 return false; 2396 return false;
2397 2397
2398 stock = &get_cpu_var(memcg_stock); 2398 stock = &get_cpu_var(memcg_stock);
2399 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2399 if (memcg == stock->cached && stock->nr_pages >= nr_pages)
2400 stock->nr_pages -= nr_pages; 2400 stock->nr_pages -= nr_pages;
2401 else /* need to call res_counter_charge */ 2401 else /* need to call res_counter_charge */
2402 ret = false; 2402 ret = false;
2403 put_cpu_var(memcg_stock); 2403 put_cpu_var(memcg_stock);
2404 return ret; 2404 return ret;
2405 } 2405 }
2406 2406
2407 /* 2407 /*
2408 * Returns stocks cached in percpu to the res_counter and resets the cached information. 2408 * Returns stocks cached in percpu to the res_counter and resets the cached information.
2409 */ 2409 */
2410 static void drain_stock(struct memcg_stock_pcp *stock) 2410 static void drain_stock(struct memcg_stock_pcp *stock)
2411 { 2411 {
2412 struct mem_cgroup *old = stock->cached; 2412 struct mem_cgroup *old = stock->cached;
2413 2413
2414 if (stock->nr_pages) { 2414 if (stock->nr_pages) {
2415 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2415 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2416 2416
2417 res_counter_uncharge(&old->res, bytes); 2417 res_counter_uncharge(&old->res, bytes);
2418 if (do_swap_account) 2418 if (do_swap_account)
2419 res_counter_uncharge(&old->memsw, bytes); 2419 res_counter_uncharge(&old->memsw, bytes);
2420 stock->nr_pages = 0; 2420 stock->nr_pages = 0;
2421 } 2421 }
2422 stock->cached = NULL; 2422 stock->cached = NULL;
2423 } 2423 }
2424 2424
2425 /* 2425 /*
2426 * This must be called under preempt disabled or must be called by 2426 * This must be called under preempt disabled or must be called by
2427 * a thread which is pinned to local cpu. 2427 * a thread which is pinned to local cpu.
2428 */ 2428 */
2429 static void drain_local_stock(struct work_struct *dummy) 2429 static void drain_local_stock(struct work_struct *dummy)
2430 { 2430 {
2431 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2431 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2432 drain_stock(stock); 2432 drain_stock(stock);
2433 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2433 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2434 } 2434 }
2435 2435
2436 static void __init memcg_stock_init(void) 2436 static void __init memcg_stock_init(void)
2437 { 2437 {
2438 int cpu; 2438 int cpu;
2439 2439
2440 for_each_possible_cpu(cpu) { 2440 for_each_possible_cpu(cpu) {
2441 struct memcg_stock_pcp *stock = 2441 struct memcg_stock_pcp *stock =
2442 &per_cpu(memcg_stock, cpu); 2442 &per_cpu(memcg_stock, cpu);
2443 INIT_WORK(&stock->work, drain_local_stock); 2443 INIT_WORK(&stock->work, drain_local_stock);
2444 } 2444 }
2445 } 2445 }
2446 2446
2447 /* 2447 /*
2448 * Cache charges obtained from the res_counter in the local per-cpu area. 2448 * Cache charges obtained from the res_counter in the local per-cpu area.
2449 * They will be consumed by consume_stock() later. 2449 * They will be consumed by consume_stock() later.
2450 */ 2450 */
2451 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2451 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2452 { 2452 {
2453 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2453 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2454 2454
2455 if (stock->cached != memcg) { /* reset if necessary */ 2455 if (stock->cached != memcg) { /* reset if necessary */
2456 drain_stock(stock); 2456 drain_stock(stock);
2457 stock->cached = memcg; 2457 stock->cached = memcg;
2458 } 2458 }
2459 stock->nr_pages += nr_pages; 2459 stock->nr_pages += nr_pages;
2460 put_cpu_var(memcg_stock); 2460 put_cpu_var(memcg_stock);
2461 } 2461 }
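
consume_stock(), drain_stock() and refill_stock() together implement a small per-cpu cache of pre-charged pages sitting in front of the res_counter. The sketch below is a single-threaded userspace model of that cache, assuming one stock and a plain integer standing in for the res_counter; only CHARGE_BATCH and the function names come from the code above, everything else is invented.

#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

struct stock {
        int cached_group;       /* which group the stock belongs to (-1: none) */
        unsigned int nr_pages;  /* pre-charged pages available locally */
};

static unsigned long counter;   /* global "res_counter" usage, in pages */

static bool consume_stock(struct stock *s, int group, unsigned int nr_pages)
{
        if (nr_pages > CHARGE_BATCH)
                return false;
        if (s->cached_group == group && s->nr_pages >= nr_pages) {
                s->nr_pages -= nr_pages;
                return true;            /* served without touching the counter */
        }
        return false;
}

static void drain_stock(struct stock *s)
{
        counter -= s->nr_pages;         /* give the unused pre-charge back */
        s->nr_pages = 0;
        s->cached_group = -1;
}

static void refill_stock(struct stock *s, int group, unsigned int nr_pages)
{
        if (s->cached_group != group)
                drain_stock(s);         /* the stock only caches one group */
        s->cached_group = group;
        s->nr_pages += nr_pages;
}

int main(void)
{
        struct stock s = { .cached_group = -1 };

        counter += CHARGE_BATCH;                /* charge a whole batch up front... */
        refill_stock(&s, 1, CHARGE_BATCH - 1);  /* ...and park the surplus locally */
        printf("consumed: %d, left in stock: %u, counter: %lu\n",
               consume_stock(&s, 1, 4), s.nr_pages, counter);
        return 0;
}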
2462 2462
2463 /* 2463 /*
2464 * Drains all per-CPU charge caches for the given root_memcg and the subtree 2464 * Drains all per-CPU charge caches for the given root_memcg and the subtree
2465 * of the hierarchy under it. The sync flag says whether we should block 2465 * of the hierarchy under it. The sync flag says whether we should block
2466 * until the work is done. 2466 * until the work is done.
2467 */ 2467 */
2468 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2468 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2469 { 2469 {
2470 int cpu, curcpu; 2470 int cpu, curcpu;
2471 2471
2472 /* Notify other cpus that system-wide "drain" is running */ 2472 /* Notify other cpus that system-wide "drain" is running */
2473 get_online_cpus(); 2473 get_online_cpus();
2474 curcpu = get_cpu(); 2474 curcpu = get_cpu();
2475 for_each_online_cpu(cpu) { 2475 for_each_online_cpu(cpu) {
2476 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2476 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2477 struct mem_cgroup *memcg; 2477 struct mem_cgroup *memcg;
2478 2478
2479 memcg = stock->cached; 2479 memcg = stock->cached;
2480 if (!memcg || !stock->nr_pages) 2480 if (!memcg || !stock->nr_pages)
2481 continue; 2481 continue;
2482 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2482 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2483 continue; 2483 continue;
2484 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2484 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2485 if (cpu == curcpu) 2485 if (cpu == curcpu)
2486 drain_local_stock(&stock->work); 2486 drain_local_stock(&stock->work);
2487 else 2487 else
2488 schedule_work_on(cpu, &stock->work); 2488 schedule_work_on(cpu, &stock->work);
2489 } 2489 }
2490 } 2490 }
2491 put_cpu(); 2491 put_cpu();
2492 2492
2493 if (!sync) 2493 if (!sync)
2494 goto out; 2494 goto out;
2495 2495
2496 for_each_online_cpu(cpu) { 2496 for_each_online_cpu(cpu) {
2497 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2497 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2498 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2498 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2499 flush_work(&stock->work); 2499 flush_work(&stock->work);
2500 } 2500 }
2501 out: 2501 out:
2502 put_online_cpus(); 2502 put_online_cpus();
2503 } 2503 }
2504 2504
2505 /* 2505 /*
2506 * Tries to drain stocked charges in other cpus. This function is asynchronous 2506 * Tries to drain stocked charges in other cpus. This function is asynchronous
2507 * and just puts a work item on each cpu for draining locally. The caller can 2507 * and just puts a work item on each cpu for draining locally. The caller can
2508 * expect some charges to go back to the res_counter later but cannot wait for 2508 * expect some charges to go back to the res_counter later but cannot wait for
2509 * it. 2509 * it.
2510 */ 2510 */
2511 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2511 static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2512 { 2512 {
2513 /* 2513 /*
2514 * If someone is already draining, avoid adding more kworker runs. 2514 * If someone is already draining, avoid adding more kworker runs.
2515 */ 2515 */
2516 if (!mutex_trylock(&percpu_charge_mutex)) 2516 if (!mutex_trylock(&percpu_charge_mutex))
2517 return; 2517 return;
2518 drain_all_stock(root_memcg, false); 2518 drain_all_stock(root_memcg, false);
2519 mutex_unlock(&percpu_charge_mutex); 2519 mutex_unlock(&percpu_charge_mutex);
2520 } 2520 }
2521 2521
2522 /* This is a synchronous drain interface. */ 2522 /* This is a synchronous drain interface. */
2523 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2523 static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2524 { 2524 {
2525 /* called when force_empty is called */ 2525 /* called when force_empty is called */
2526 mutex_lock(&percpu_charge_mutex); 2526 mutex_lock(&percpu_charge_mutex);
2527 drain_all_stock(root_memcg, true); 2527 drain_all_stock(root_memcg, true);
2528 mutex_unlock(&percpu_charge_mutex); 2528 mutex_unlock(&percpu_charge_mutex);
2529 } 2529 }
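
The two entry points differ only in how they take percpu_charge_mutex: the async variant gives up immediately if another drainer already holds it, while the sync variant waits for its turn and then relies on drain_all_stock(..., true) to flush the queued work. A rough userspace analogue using a pthread mutex follows; it is illustrative only and not the kernel's locking primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t drain_mutex = PTHREAD_MUTEX_INITIALIZER;

static void do_drain(int sync)
{
        /* stand-in for drain_all_stock(root_memcg, sync) */
        printf("draining (%s)\n", sync ? "sync" : "async");
}

static void drain_async(void)
{
        if (pthread_mutex_trylock(&drain_mutex) != 0)
                return;                 /* someone is already draining: skip */
        do_drain(0);
        pthread_mutex_unlock(&drain_mutex);
}

static void drain_sync(void)
{
        pthread_mutex_lock(&drain_mutex);       /* always wait our turn */
        do_drain(1);
        pthread_mutex_unlock(&drain_mutex);
}

int main(void)
{
        drain_async();
        drain_sync();
        return 0;
}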
2530 2530
2531 /* 2531 /*
2532 * This function drains the percpu counter values from a DEAD cpu and 2532 * This function drains the percpu counter values from a DEAD cpu and
2533 * moves them to the local cpu. Note that this function can be preempted. 2533 * moves them to the local cpu. Note that this function can be preempted.
2534 */ 2534 */
2535 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2535 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2536 { 2536 {
2537 int i; 2537 int i;
2538 2538
2539 spin_lock(&memcg->pcp_counter_lock); 2539 spin_lock(&memcg->pcp_counter_lock);
2540 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2540 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2541 long x = per_cpu(memcg->stat->count[i], cpu); 2541 long x = per_cpu(memcg->stat->count[i], cpu);
2542 2542
2543 per_cpu(memcg->stat->count[i], cpu) = 0; 2543 per_cpu(memcg->stat->count[i], cpu) = 0;
2544 memcg->nocpu_base.count[i] += x; 2544 memcg->nocpu_base.count[i] += x;
2545 } 2545 }
2546 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2546 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2547 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2547 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2548 2548
2549 per_cpu(memcg->stat->events[i], cpu) = 0; 2549 per_cpu(memcg->stat->events[i], cpu) = 0;
2550 memcg->nocpu_base.events[i] += x; 2550 memcg->nocpu_base.events[i] += x;
2551 } 2551 }
2552 spin_unlock(&memcg->pcp_counter_lock); 2552 spin_unlock(&memcg->pcp_counter_lock);
2553 } 2553 }
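
mem_cgroup_drain_pcp_counter() folds a dead CPU's per-cpu counters into the memcg's nocpu_base accumulators so the totals survive the CPU going away. Below is a tiny userspace model of that folding; the array sizes and names are arbitrary choices for the sketch.

#include <stdio.h>

#define NR_CPUS 4
#define NSTATS  3

static long percpu_count[NR_CPUS][NSTATS];
static long nocpu_base[NSTATS];

static void drain_dead_cpu(int cpu)
{
        for (int i = 0; i < NSTATS; i++) {
                nocpu_base[i] += percpu_count[cpu][i];  /* keep the totals */
                percpu_count[cpu][i] = 0;               /* the cpu slot is gone */
        }
}

static long read_stat(int idx)
{
        long sum = nocpu_base[idx];

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += percpu_count[cpu][idx];
        return sum;
}

int main(void)
{
        percpu_count[2][0] = 10;
        drain_dead_cpu(2);                              /* "cpu 2" went offline */
        printf("stat[0] = %ld\n", read_stat(0));        /* still 10 */
        return 0;
}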
2554 2554
2555 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2555 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2556 unsigned long action, 2556 unsigned long action,
2557 void *hcpu) 2557 void *hcpu)
2558 { 2558 {
2559 int cpu = (unsigned long)hcpu; 2559 int cpu = (unsigned long)hcpu;
2560 struct memcg_stock_pcp *stock; 2560 struct memcg_stock_pcp *stock;
2561 struct mem_cgroup *iter; 2561 struct mem_cgroup *iter;
2562 2562
2563 if (action == CPU_ONLINE) 2563 if (action == CPU_ONLINE)
2564 return NOTIFY_OK; 2564 return NOTIFY_OK;
2565 2565
2566 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2566 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2567 return NOTIFY_OK; 2567 return NOTIFY_OK;
2568 2568
2569 for_each_mem_cgroup(iter) 2569 for_each_mem_cgroup(iter)
2570 mem_cgroup_drain_pcp_counter(iter, cpu); 2570 mem_cgroup_drain_pcp_counter(iter, cpu);
2571 2571
2572 stock = &per_cpu(memcg_stock, cpu); 2572 stock = &per_cpu(memcg_stock, cpu);
2573 drain_stock(stock); 2573 drain_stock(stock);
2574 return NOTIFY_OK; 2574 return NOTIFY_OK;
2575 } 2575 }
2576 2576
2577 2577
2578 /* See mem_cgroup_try_charge() for details */ 2578 /* See mem_cgroup_try_charge() for details */
2579 enum { 2579 enum {
2580 CHARGE_OK, /* success */ 2580 CHARGE_OK, /* success */
2581 CHARGE_RETRY, /* need to retry but retry is not bad */ 2581 CHARGE_RETRY, /* need to retry but retry is not bad */
2582 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2582 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2583 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */ 2583 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */
2584 }; 2584 };
2585 2585
2586 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2586 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2587 unsigned int nr_pages, unsigned int min_pages, 2587 unsigned int nr_pages, unsigned int min_pages,
2588 bool invoke_oom) 2588 bool invoke_oom)
2589 { 2589 {
2590 unsigned long csize = nr_pages * PAGE_SIZE; 2590 unsigned long csize = nr_pages * PAGE_SIZE;
2591 struct mem_cgroup *mem_over_limit; 2591 struct mem_cgroup *mem_over_limit;
2592 struct res_counter *fail_res; 2592 struct res_counter *fail_res;
2593 unsigned long flags = 0; 2593 unsigned long flags = 0;
2594 int ret; 2594 int ret;
2595 2595
2596 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2596 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2597 2597
2598 if (likely(!ret)) { 2598 if (likely(!ret)) {
2599 if (!do_swap_account) 2599 if (!do_swap_account)
2600 return CHARGE_OK; 2600 return CHARGE_OK;
2601 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2601 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2602 if (likely(!ret)) 2602 if (likely(!ret))
2603 return CHARGE_OK; 2603 return CHARGE_OK;
2604 2604
2605 res_counter_uncharge(&memcg->res, csize); 2605 res_counter_uncharge(&memcg->res, csize);
2606 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2606 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2607 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2607 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2608 } else 2608 } else
2609 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2609 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2610 /* 2610 /*
2611 * Never reclaim on behalf of optional batching, retry with a 2611 * Never reclaim on behalf of optional batching, retry with a
2612 * single page instead. 2612 * single page instead.
2613 */ 2613 */
2614 if (nr_pages > min_pages) 2614 if (nr_pages > min_pages)
2615 return CHARGE_RETRY; 2615 return CHARGE_RETRY;
2616 2616
2617 if (!(gfp_mask & __GFP_WAIT)) 2617 if (!(gfp_mask & __GFP_WAIT))
2618 return CHARGE_WOULDBLOCK; 2618 return CHARGE_WOULDBLOCK;
2619 2619
2620 if (gfp_mask & __GFP_NORETRY) 2620 if (gfp_mask & __GFP_NORETRY)
2621 return CHARGE_NOMEM; 2621 return CHARGE_NOMEM;
2622 2622
2623 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2623 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
2624 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2624 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2625 return CHARGE_RETRY; 2625 return CHARGE_RETRY;
2626 /* 2626 /*
2627 * Even though the limit is exceeded at this point, reclaim 2627 * Even though the limit is exceeded at this point, reclaim
2628 * may have been able to free some pages. Retry the charge 2628 * may have been able to free some pages. Retry the charge
2629 * before killing the task. 2629 * before killing the task.
2630 * 2630 *
2631 * Only for regular pages, though: huge pages are rather 2631 * Only for regular pages, though: huge pages are rather
2632 * unlikely to succeed so close to the limit, and we fall back 2632 * unlikely to succeed so close to the limit, and we fall back
2633 * to regular pages anyway in case of failure. 2633 * to regular pages anyway in case of failure.
2634 */ 2634 */
2635 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) 2635 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2636 return CHARGE_RETRY; 2636 return CHARGE_RETRY;
2637 2637
2638 /* 2638 /*
2639 * At task move, charge accounts can be doubly counted. So, it's 2639 * At task move, charge accounts can be doubly counted. So, it's
2640 * better to wait until the end of task_move if something is going on. 2640 * better to wait until the end of task_move if something is going on.
2641 */ 2641 */
2642 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2642 if (mem_cgroup_wait_acct_move(mem_over_limit))
2643 return CHARGE_RETRY; 2643 return CHARGE_RETRY;
2644 2644
2645 if (invoke_oom) 2645 if (invoke_oom)
2646 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); 2646 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2647 2647
2648 return CHARGE_NOMEM; 2648 return CHARGE_NOMEM;
2649 } 2649 }
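
mem_cgroup_do_charge() boils down to a decision ladder over the four CHARGE_* outcomes: succeed, drop the optional batching and retry, refuse to sleep for atomic callers, or report NOMEM so the caller may enter the OOM path. The userspace model below reproduces that ladder against a plain usage/limit pair; the reclaim step is reduced to a boolean and everything else (names, values) is invented for the illustration.

#include <stdbool.h>
#include <stdio.h>

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_WOULDBLOCK, CHARGE_NOMEM };

static unsigned long limit = 100, usage;

static bool counter_charge(unsigned int nr_pages)
{
        if (usage + nr_pages > limit)
                return false;
        usage += nr_pages;
        return true;
}

static int do_charge(unsigned int nr_pages, unsigned int min_pages,
                     bool can_sleep, bool reclaimed_enough)
{
        if (counter_charge(nr_pages))
                return CHARGE_OK;
        if (nr_pages > min_pages)
                return CHARGE_RETRY;            /* drop the optional batching first */
        if (!can_sleep)
                return CHARGE_WOULDBLOCK;       /* atomic context: no reclaim */
        if (reclaimed_enough)
                return CHARGE_RETRY;            /* reclaim freed room, try again */
        return CHARGE_NOMEM;                    /* caller may now invoke the OOM path */
}

int main(void)
{
        usage = 90;
        printf("batch of 32: %d\n", do_charge(32, 1, true, false)); /* CHARGE_RETRY */
        printf("single page: %d\n", do_charge(1, 1, true, false));  /* CHARGE_OK */
        return 0;
}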
2650 2650
2651 /** 2651 /**
2652 * mem_cgroup_try_charge - try charging a memcg 2652 * mem_cgroup_try_charge - try charging a memcg
2653 * @memcg: memcg to charge 2653 * @memcg: memcg to charge
2654 * @nr_pages: number of pages to charge 2654 * @nr_pages: number of pages to charge
2655 * @oom: trigger OOM if reclaim fails 2655 * @oom: trigger OOM if reclaim fails
2656 * 2656 *
2657 * Returns 0 if @memcg was charged successfully, -EINTR if the charge 2657 * Returns 0 if @memcg was charged successfully, -EINTR if the charge
2658 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. 2658 * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
2659 */ 2659 */
2660 static int mem_cgroup_try_charge(struct mem_cgroup *memcg, 2660 static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
2661 gfp_t gfp_mask, 2661 gfp_t gfp_mask,
2662 unsigned int nr_pages, 2662 unsigned int nr_pages,
2663 bool oom) 2663 bool oom)
2664 { 2664 {
2665 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2665 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2666 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2667 int ret; 2667 int ret;
2668 2668
2669 if (mem_cgroup_is_root(memcg)) 2669 if (mem_cgroup_is_root(memcg))
2670 goto done; 2670 goto done;
2671 /* 2671 /*
2672 * Unlike in global OOM situations, memcg is not in a physical 2672 * Unlike in global OOM situations, memcg is not in a physical
2673 * memory shortage. Allow dying and OOM-killed tasks to 2673 * memory shortage. Allow dying and OOM-killed tasks to
2674 * bypass the last charges so that they can exit quickly and 2674 * bypass the last charges so that they can exit quickly and
2675 * free their memory. 2675 * free their memory.
2676 */ 2676 */
2677 if (unlikely(test_thread_flag(TIF_MEMDIE) || 2677 if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2678 fatal_signal_pending(current))) 2678 fatal_signal_pending(current)))
2679 goto bypass; 2679 goto bypass;
2680 2680
2681 if (unlikely(task_in_memcg_oom(current))) 2681 if (unlikely(task_in_memcg_oom(current)))
2682 goto nomem; 2682 goto nomem;
2683 2683
2684 if (gfp_mask & __GFP_NOFAIL) 2684 if (gfp_mask & __GFP_NOFAIL)
2685 oom = false; 2685 oom = false;
2686 again: 2686 again:
2687 if (consume_stock(memcg, nr_pages)) 2687 if (consume_stock(memcg, nr_pages))
2688 goto done; 2688 goto done;
2689 2689
2690 do { 2690 do {
2691 bool invoke_oom = oom && !nr_oom_retries; 2691 bool invoke_oom = oom && !nr_oom_retries;
2692 2692
2693 /* If killed, bypass charge */ 2693 /* If killed, bypass charge */
2694 if (fatal_signal_pending(current)) 2694 if (fatal_signal_pending(current))
2695 goto bypass; 2695 goto bypass;
2696 2696
2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, 2697 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2698 nr_pages, invoke_oom); 2698 nr_pages, invoke_oom);
2699 switch (ret) { 2699 switch (ret) {
2700 case CHARGE_OK: 2700 case CHARGE_OK:
2701 break; 2701 break;
2702 case CHARGE_RETRY: /* not in OOM situation but retry */ 2702 case CHARGE_RETRY: /* not in OOM situation but retry */
2703 batch = nr_pages; 2703 batch = nr_pages;
2704 goto again; 2704 goto again;
2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2705 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2706 goto nomem; 2706 goto nomem;
2707 case CHARGE_NOMEM: /* OOM routine works */ 2707 case CHARGE_NOMEM: /* OOM routine works */
2708 if (!oom || invoke_oom) 2708 if (!oom || invoke_oom)
2709 goto nomem; 2709 goto nomem;
2710 nr_oom_retries--; 2710 nr_oom_retries--;
2711 break; 2711 break;
2712 } 2712 }
2713 } while (ret != CHARGE_OK); 2713 } while (ret != CHARGE_OK);
2714 2714
2715 if (batch > nr_pages) 2715 if (batch > nr_pages)
2716 refill_stock(memcg, batch - nr_pages); 2716 refill_stock(memcg, batch - nr_pages);
2717 done: 2717 done:
2718 return 0; 2718 return 0;
2719 nomem: 2719 nomem:
2720 if (!(gfp_mask & __GFP_NOFAIL)) 2720 if (!(gfp_mask & __GFP_NOFAIL))
2721 return -ENOMEM; 2721 return -ENOMEM;
2722 bypass: 2722 bypass:
2723 return -EINTR; 2723 return -EINTR;
2724 } 2724 }
2725 2725
2726 /** 2726 /**
2727 * mem_cgroup_try_charge_mm - try charging a mm 2727 * mem_cgroup_try_charge_mm - try charging a mm
2728 * @mm: mm_struct to charge 2728 * @mm: mm_struct to charge
2729 * @nr_pages: number of pages to charge 2729 * @nr_pages: number of pages to charge
2730 * @oom: trigger OOM if reclaim fails 2730 * @oom: trigger OOM if reclaim fails
2731 * 2731 *
2732 * Returns the charged mem_cgroup associated with the given mm_struct or 2732 * Returns the charged mem_cgroup associated with the given mm_struct or
2733 * NULL if the charge failed. 2733 * NULL if the charge failed.
2734 */ 2734 */
2735 static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, 2735 static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
2736 gfp_t gfp_mask, 2736 gfp_t gfp_mask,
2737 unsigned int nr_pages, 2737 unsigned int nr_pages,
2738 bool oom) 2738 bool oom)
2739 2739
2740 { 2740 {
2741 struct mem_cgroup *memcg; 2741 struct mem_cgroup *memcg;
2742 int ret; 2742 int ret;
2743 2743
2744 memcg = get_mem_cgroup_from_mm(mm); 2744 memcg = get_mem_cgroup_from_mm(mm);
2745 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); 2745 ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
2746 css_put(&memcg->css); 2746 css_put(&memcg->css);
2747 if (ret == -EINTR) 2747 if (ret == -EINTR)
2748 memcg = root_mem_cgroup; 2748 memcg = root_mem_cgroup;
2749 else if (ret) 2749 else if (ret)
2750 memcg = NULL; 2750 memcg = NULL;
2751 2751
2752 return memcg; 2752 return memcg;
2753 } 2753 }
2754 2754
2755 /* 2755 /*
2756 * Sometimes we have to undo a charge we got by try_charge(). 2756 * Sometimes we have to undo a charge we got by try_charge().
2757 * This function is for that: it does the uncharge and puts the css refcount 2757 * This function is for that: it does the uncharge and puts the css refcount
2758 * gotten by try_charge(). 2758 * gotten by try_charge().
2759 */ 2759 */
2760 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, 2760 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2761 unsigned int nr_pages) 2761 unsigned int nr_pages)
2762 { 2762 {
2763 if (!mem_cgroup_is_root(memcg)) { 2763 if (!mem_cgroup_is_root(memcg)) {
2764 unsigned long bytes = nr_pages * PAGE_SIZE; 2764 unsigned long bytes = nr_pages * PAGE_SIZE;
2765 2765
2766 res_counter_uncharge(&memcg->res, bytes); 2766 res_counter_uncharge(&memcg->res, bytes);
2767 if (do_swap_account) 2767 if (do_swap_account)
2768 res_counter_uncharge(&memcg->memsw, bytes); 2768 res_counter_uncharge(&memcg->memsw, bytes);
2769 } 2769 }
2770 } 2770 }
2771 2771
2772 /* 2772 /*
2773 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup. 2773 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
2774 * This is useful when moving usage to parent cgroup. 2774 * This is useful when moving usage to parent cgroup.
2775 */ 2775 */
2776 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, 2776 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2777 unsigned int nr_pages) 2777 unsigned int nr_pages)
2778 { 2778 {
2779 unsigned long bytes = nr_pages * PAGE_SIZE; 2779 unsigned long bytes = nr_pages * PAGE_SIZE;
2780 2780
2781 if (mem_cgroup_is_root(memcg)) 2781 if (mem_cgroup_is_root(memcg))
2782 return; 2782 return;
2783 2783
2784 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); 2784 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2785 if (do_swap_account) 2785 if (do_swap_account)
2786 res_counter_uncharge_until(&memcg->memsw, 2786 res_counter_uncharge_until(&memcg->memsw,
2787 memcg->memsw.parent, bytes); 2787 memcg->memsw.parent, bytes);
2788 } 2788 }
2789 2789
2790 /* 2790 /*
2791 * A helper function to get a mem_cgroup from its ID. Must be called under 2791 * A helper function to get a mem_cgroup from its ID. Must be called under
2792 * rcu_read_lock(). The caller is responsible for calling css_tryget if 2792 * rcu_read_lock(). The caller is responsible for calling css_tryget if
2793 * the mem_cgroup is used for charging. (dropping refcnt from swap can be 2793 * the mem_cgroup is used for charging. (dropping refcnt from swap can be
2794 * called against removed memcg.) 2794 * called against removed memcg.)
2795 */ 2795 */
2796 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2796 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2797 { 2797 {
2798 /* ID 0 is unused ID */ 2798 /* ID 0 is unused ID */
2799 if (!id) 2799 if (!id)
2800 return NULL; 2800 return NULL;
2801 return mem_cgroup_from_id(id); 2801 return mem_cgroup_from_id(id);
2802 } 2802 }
2803 2803
2804 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2804 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2805 { 2805 {
2806 struct mem_cgroup *memcg = NULL; 2806 struct mem_cgroup *memcg = NULL;
2807 struct page_cgroup *pc; 2807 struct page_cgroup *pc;
2808 unsigned short id; 2808 unsigned short id;
2809 swp_entry_t ent; 2809 swp_entry_t ent;
2810 2810
2811 VM_BUG_ON_PAGE(!PageLocked(page), page); 2811 VM_BUG_ON_PAGE(!PageLocked(page), page);
2812 2812
2813 pc = lookup_page_cgroup(page); 2813 pc = lookup_page_cgroup(page);
2814 lock_page_cgroup(pc); 2814 lock_page_cgroup(pc);
2815 if (PageCgroupUsed(pc)) { 2815 if (PageCgroupUsed(pc)) {
2816 memcg = pc->mem_cgroup; 2816 memcg = pc->mem_cgroup;
2817 if (memcg && !css_tryget(&memcg->css)) 2817 if (memcg && !css_tryget(&memcg->css))
2818 memcg = NULL; 2818 memcg = NULL;
2819 } else if (PageSwapCache(page)) { 2819 } else if (PageSwapCache(page)) {
2820 ent.val = page_private(page); 2820 ent.val = page_private(page);
2821 id = lookup_swap_cgroup_id(ent); 2821 id = lookup_swap_cgroup_id(ent);
2822 rcu_read_lock(); 2822 rcu_read_lock();
2823 memcg = mem_cgroup_lookup(id); 2823 memcg = mem_cgroup_lookup(id);
2824 if (memcg && !css_tryget(&memcg->css)) 2824 if (memcg && !css_tryget(&memcg->css))
2825 memcg = NULL; 2825 memcg = NULL;
2826 rcu_read_unlock(); 2826 rcu_read_unlock();
2827 } 2827 }
2828 unlock_page_cgroup(pc); 2828 unlock_page_cgroup(pc);
2829 return memcg; 2829 return memcg;
2830 } 2830 }
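
Both branches above rely on the css_tryget() idiom: after a lockless or RCU lookup, a new reference is taken only if the object still holds at least one, so a dying memcg is simply treated as not found. Here is a generic userspace sketch of that idiom using C11 atomics; the names are invented and this is not the kernel's css implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcnt;
};

static bool obj_tryget(struct obj *o)
{
        int old = atomic_load(&o->refcnt);

        while (old > 0) {
                /* only bump the count if it is still non-zero */
                if (atomic_compare_exchange_weak(&o->refcnt, &old, old + 1))
                        return true;
        }
        return false;                   /* object is dying: pretend it wasn't found */
}

static void obj_put(struct obj *o)
{
        atomic_fetch_sub(&o->refcnt, 1);
}

int main(void)
{
        struct obj live = { .refcnt = 1 }, dead = { .refcnt = 0 };

        printf("live: %d, dead: %d\n", obj_tryget(&live), obj_tryget(&dead));
        if (atomic_load(&live.refcnt) == 2)
                obj_put(&live);         /* drop the reference we just took */
        return 0;
}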
2831 2831
2832 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2832 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2833 struct page *page, 2833 struct page *page,
2834 unsigned int nr_pages, 2834 unsigned int nr_pages,
2835 enum charge_type ctype, 2835 enum charge_type ctype,
2836 bool lrucare) 2836 bool lrucare)
2837 { 2837 {
2838 struct page_cgroup *pc = lookup_page_cgroup(page); 2838 struct page_cgroup *pc = lookup_page_cgroup(page);
2839 struct zone *uninitialized_var(zone); 2839 struct zone *uninitialized_var(zone);
2840 struct lruvec *lruvec; 2840 struct lruvec *lruvec;
2841 bool was_on_lru = false; 2841 bool was_on_lru = false;
2842 bool anon; 2842 bool anon;
2843 2843
2844 lock_page_cgroup(pc); 2844 lock_page_cgroup(pc);
2845 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); 2845 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2846 /* 2846 /*
2847 * we don't need page_cgroup_lock for tail pages, because they are not 2847 * we don't need page_cgroup_lock for tail pages, because they are not
2848 * accessed by any other context at this point. 2848 * accessed by any other context at this point.
2849 */ 2849 */
2850 2850
2851 /* 2851 /*
2852 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page 2852 * In some cases (SwapCache, FUSE's splice_buf->radixtree), the page
2853 * may already be on some other mem_cgroup's LRU. Take care of it. 2853 * may already be on some other mem_cgroup's LRU. Take care of it.
2854 */ 2854 */
2855 if (lrucare) { 2855 if (lrucare) {
2856 zone = page_zone(page); 2856 zone = page_zone(page);
2857 spin_lock_irq(&zone->lru_lock); 2857 spin_lock_irq(&zone->lru_lock);
2858 if (PageLRU(page)) { 2858 if (PageLRU(page)) {
2859 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2859 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2860 ClearPageLRU(page); 2860 ClearPageLRU(page);
2861 del_page_from_lru_list(page, lruvec, page_lru(page)); 2861 del_page_from_lru_list(page, lruvec, page_lru(page));
2862 was_on_lru = true; 2862 was_on_lru = true;
2863 } 2863 }
2864 } 2864 }
2865 2865
2866 pc->mem_cgroup = memcg; 2866 pc->mem_cgroup = memcg;
2867 /* 2867 /*
2868 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2868 * We access a page_cgroup asynchronously without lock_page_cgroup().
2869 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2869 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2870 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2870 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2871 * before USED bit, we need memory barrier here. 2871 * before USED bit, we need memory barrier here.
2872 * See mem_cgroup_add_lru_list(), etc. 2872 * See mem_cgroup_add_lru_list(), etc.
2873 */ 2873 */
2874 smp_wmb(); 2874 smp_wmb();
2875 SetPageCgroupUsed(pc); 2875 SetPageCgroupUsed(pc);
2876 2876
2877 if (lrucare) { 2877 if (lrucare) {
2878 if (was_on_lru) { 2878 if (was_on_lru) {
2879 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2879 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2880 VM_BUG_ON_PAGE(PageLRU(page), page); 2880 VM_BUG_ON_PAGE(PageLRU(page), page);
2881 SetPageLRU(page); 2881 SetPageLRU(page);
2882 add_page_to_lru_list(page, lruvec, page_lru(page)); 2882 add_page_to_lru_list(page, lruvec, page_lru(page));
2883 } 2883 }
2884 spin_unlock_irq(&zone->lru_lock); 2884 spin_unlock_irq(&zone->lru_lock);
2885 } 2885 }
2886 2886
2887 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2887 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2888 anon = true; 2888 anon = true;
2889 else 2889 else
2890 anon = false; 2890 anon = false;
2891 2891
2892 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2892 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
2893 unlock_page_cgroup(pc); 2893 unlock_page_cgroup(pc);
2894 2894
2895 /* 2895 /*
2896 * "charge_statistics" updated event counter. Then, check it. 2896 * "charge_statistics" updated event counter. Then, check it.
2897 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2897 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2898 * if they exceeds softlimit. 2898 * if they exceeds softlimit.
2899 */ 2899 */
2900 memcg_check_events(memcg, page); 2900 memcg_check_events(memcg, page);
2901 } 2901 }
2902 2902
2903 static DEFINE_MUTEX(set_limit_mutex); 2903 static DEFINE_MUTEX(set_limit_mutex);
2904 2904
2905 #ifdef CONFIG_MEMCG_KMEM 2905 #ifdef CONFIG_MEMCG_KMEM
2906 static DEFINE_MUTEX(activate_kmem_mutex); 2906 static DEFINE_MUTEX(activate_kmem_mutex);
2907 2907
2908 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2908 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2909 { 2909 {
2910 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2910 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2911 memcg_kmem_is_active(memcg); 2911 memcg_kmem_is_active(memcg);
2912 } 2912 }
2913 2913
2914 /* 2914 /*
2915 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2915 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2916 * in the memcg_cache_params struct. 2916 * in the memcg_cache_params struct.
2917 */ 2917 */
2918 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2918 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2919 { 2919 {
2920 struct kmem_cache *cachep; 2920 struct kmem_cache *cachep;
2921 2921
2922 VM_BUG_ON(p->is_root_cache); 2922 VM_BUG_ON(p->is_root_cache);
2923 cachep = p->root_cache; 2923 cachep = p->root_cache;
2924 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2924 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2925 } 2925 }
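
The backpointer comment above describes the shape of the data: the root cache owns an array of per-memcg child caches indexed by memcg_cache_id(), and each child's params carry the owning memcg plus a root_cache pointer, which is enough to find the child cache again. A self-contained userspace model of that layout follows, with structure names invented for the sketch.

#include <assert.h>
#include <stdio.h>

#define MAX_GROUPS 8

struct cache;

struct cache_params {
        int is_root;
        int memcg_id;                           /* child: the owner's kmem id */
        struct cache *root_cache;               /* child: backpointer to the root */
        struct cache *memcg_caches[MAX_GROUPS]; /* root: children indexed by id */
};

struct cache {
        const char *name;
        struct cache_params params;
};

/* analogue of memcg_params_to_cache(): child params -> child cache */
static struct cache *params_to_cache(struct cache_params *p)
{
        assert(!p->is_root);
        return p->root_cache->params.memcg_caches[p->memcg_id];
}

int main(void)
{
        struct cache root = { .name = "dentry", .params = { .is_root = 1 } };
        struct cache child = {
                .name = "dentry(2:grp)",
                .params = { .memcg_id = 2, .root_cache = &root },
        };

        root.params.memcg_caches[2] = &child;   /* publish the child in the root */
        printf("%s\n", params_to_cache(&child.params)->name);
        return 0;
}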
2926 2926
2927 #ifdef CONFIG_SLABINFO 2927 #ifdef CONFIG_SLABINFO
2928 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) 2928 static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2929 { 2929 {
2930 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 2930 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2931 struct memcg_cache_params *params; 2931 struct memcg_cache_params *params;
2932 2932
2933 if (!memcg_can_account_kmem(memcg)) 2933 if (!memcg_can_account_kmem(memcg))
2934 return -EIO; 2934 return -EIO;
2935 2935
2936 print_slabinfo_header(m); 2936 print_slabinfo_header(m);
2937 2937
2938 mutex_lock(&memcg->slab_caches_mutex); 2938 mutex_lock(&memcg->slab_caches_mutex);
2939 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 2939 list_for_each_entry(params, &memcg->memcg_slab_caches, list)
2940 cache_show(memcg_params_to_cache(params), m); 2940 cache_show(memcg_params_to_cache(params), m);
2941 mutex_unlock(&memcg->slab_caches_mutex); 2941 mutex_unlock(&memcg->slab_caches_mutex);
2942 2942
2943 return 0; 2943 return 0;
2944 } 2944 }
2945 #endif 2945 #endif
2946 2946
2947 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 2947 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
2948 { 2948 {
2949 struct res_counter *fail_res; 2949 struct res_counter *fail_res;
2950 int ret = 0; 2950 int ret = 0;
2951 2951
2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 2952 ret = res_counter_charge(&memcg->kmem, size, &fail_res);
2953 if (ret) 2953 if (ret)
2954 return ret; 2954 return ret;
2955 2955
2956 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, 2956 ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT,
2957 oom_gfp_allowed(gfp)); 2957 oom_gfp_allowed(gfp));
2958 if (ret == -EINTR) { 2958 if (ret == -EINTR) {
2959 /* 2959 /*
2960 * mem_cgroup_try_charge() chose to bypass to root due to 2960 * mem_cgroup_try_charge() chose to bypass to root due to
2961 * OOM kill or fatal signal. Since our only options are to 2961 * OOM kill or fatal signal. Since our only options are to
2962 * either fail the allocation or charge it to this cgroup, do 2962 * either fail the allocation or charge it to this cgroup, do
2963 * it as a temporary condition. But we can't fail. From a 2963 * it as a temporary condition. But we can't fail. From a
2964 * kmem/slab perspective, the cache has already been selected, 2964 * kmem/slab perspective, the cache has already been selected,
2965 * by mem_cgroup_kmem_get_cache(), so it is too late to change 2965 * by mem_cgroup_kmem_get_cache(), so it is too late to change
2966 * our minds. 2966 * our minds.
2967 * 2967 *
2968 * This condition will only trigger if the task entered 2968 * This condition will only trigger if the task entered
2969 * memcg_charge_kmem in a sane state, but was OOM-killed during 2969 * memcg_charge_kmem in a sane state, but was OOM-killed during
2970 * mem_cgroup_try_charge() above. Tasks that were already 2970 * mem_cgroup_try_charge() above. Tasks that were already
2971 * dying when the allocation triggers should have been already 2971 * dying when the allocation triggers should have been already
2972 * directed to the root cgroup in memcontrol.h 2972 * directed to the root cgroup in memcontrol.h
2973 */ 2973 */
2974 res_counter_charge_nofail(&memcg->res, size, &fail_res); 2974 res_counter_charge_nofail(&memcg->res, size, &fail_res);
2975 if (do_swap_account) 2975 if (do_swap_account)
2976 res_counter_charge_nofail(&memcg->memsw, size, 2976 res_counter_charge_nofail(&memcg->memsw, size,
2977 &fail_res); 2977 &fail_res);
2978 ret = 0; 2978 ret = 0;
2979 } else if (ret) 2979 } else if (ret)
2980 res_counter_uncharge(&memcg->kmem, size); 2980 res_counter_uncharge(&memcg->kmem, size);
2981 2981
2982 return ret; 2982 return ret;
2983 } 2983 }
2984 2984
2985 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 2985 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
2986 { 2986 {
2987 res_counter_uncharge(&memcg->res, size); 2987 res_counter_uncharge(&memcg->res, size);
2988 if (do_swap_account) 2988 if (do_swap_account)
2989 res_counter_uncharge(&memcg->memsw, size); 2989 res_counter_uncharge(&memcg->memsw, size);
2990 2990
2991 /* Not down to 0 */ 2991 /* Not down to 0 */
2992 if (res_counter_uncharge(&memcg->kmem, size)) 2992 if (res_counter_uncharge(&memcg->kmem, size))
2993 return; 2993 return;
2994 2994
2995 /* 2995 /*
2996 * Releases a reference taken in kmem_cgroup_css_offline in case 2996 * Releases a reference taken in kmem_cgroup_css_offline in case
2997 * this last uncharge is racing with the offlining code or it is 2997 * this last uncharge is racing with the offlining code or it is
2998 * outliving the memcg existence. 2998 * outliving the memcg existence.
2999 * 2999 *
3000 * The memory barrier imposed by test&clear is paired with the 3000 * The memory barrier imposed by test&clear is paired with the
3001 * explicit one in memcg_kmem_mark_dead(). 3001 * explicit one in memcg_kmem_mark_dead().
3002 */ 3002 */
3003 if (memcg_kmem_test_and_clear_dead(memcg)) 3003 if (memcg_kmem_test_and_clear_dead(memcg))
3004 css_put(&memcg->css); 3004 css_put(&memcg->css);
3005 } 3005 }
3006 3006
3007 /* 3007 /*
3008 * helper for accessing a memcg's index. It will be used as an index in the 3008 * helper for accessing a memcg's index. It will be used as an index in the
3009 * child cache array in kmem_cache, and also to derive its name. This function 3009 * child cache array in kmem_cache, and also to derive its name. This function
3010 * will return -1 when this is not a kmem-limited memcg. 3010 * will return -1 when this is not a kmem-limited memcg.
3011 */ 3011 */
3012 int memcg_cache_id(struct mem_cgroup *memcg) 3012 int memcg_cache_id(struct mem_cgroup *memcg)
3013 { 3013 {
3014 return memcg ? memcg->kmemcg_id : -1; 3014 return memcg ? memcg->kmemcg_id : -1;
3015 } 3015 }
3016 3016
3017 static size_t memcg_caches_array_size(int num_groups) 3017 static size_t memcg_caches_array_size(int num_groups)
3018 { 3018 {
3019 ssize_t size; 3019 ssize_t size;
3020 if (num_groups <= 0) 3020 if (num_groups <= 0)
3021 return 0; 3021 return 0;
3022 3022
3023 size = 2 * num_groups; 3023 size = 2 * num_groups;
3024 if (size < MEMCG_CACHES_MIN_SIZE) 3024 if (size < MEMCG_CACHES_MIN_SIZE)
3025 size = MEMCG_CACHES_MIN_SIZE; 3025 size = MEMCG_CACHES_MIN_SIZE;
3026 else if (size > MEMCG_CACHES_MAX_SIZE) 3026 else if (size > MEMCG_CACHES_MAX_SIZE)
3027 size = MEMCG_CACHES_MAX_SIZE; 3027 size = MEMCG_CACHES_MAX_SIZE;
3028 3028
3029 return size; 3029 return size;
3030 } 3030 }
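
memcg_caches_array_size() sizes the per-memcg cache array by doubling the group count and clamping the result. A worked example follows; note that the real MEMCG_CACHES_MIN_SIZE/MAX_SIZE constants are defined elsewhere in the tree, so the clamp values below are placeholders chosen for the sketch.

#include <stdio.h>
#include <sys/types.h>

#define MIN_SIZE 4L             /* placeholder for MEMCG_CACHES_MIN_SIZE */
#define MAX_SIZE 65535L         /* placeholder for MEMCG_CACHES_MAX_SIZE */

static size_t caches_array_size(int num_groups)
{
        ssize_t size;

        if (num_groups <= 0)
                return 0;
        size = 2 * num_groups;          /* leave headroom for new groups */
        if (size < MIN_SIZE)
                size = MIN_SIZE;
        else if (size > MAX_SIZE)
                size = MAX_SIZE;
        return size;
}

int main(void)
{
        printf("%zu %zu %zu\n",
               caches_array_size(1),            /* clamped up to the minimum */
               caches_array_size(100),          /* simply doubled to 200 */
               caches_array_size(1 << 20));     /* clamped down to the maximum */
        return 0;
}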
3031 3031
3032 /* 3032 /*
3033 * We should update the current array size iff all cache updates succeed. This 3033 * We should update the current array size iff all cache updates succeed. This
3034 * can only be done from the slab side. The slab mutex needs to be held when 3034 * can only be done from the slab side. The slab mutex needs to be held when
3035 * calling this. 3035 * calling this.
3036 */ 3036 */
3037 void memcg_update_array_size(int num) 3037 void memcg_update_array_size(int num)
3038 { 3038 {
3039 if (num > memcg_limited_groups_array_size) 3039 if (num > memcg_limited_groups_array_size)
3040 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3040 memcg_limited_groups_array_size = memcg_caches_array_size(num);
3041 } 3041 }
3042 3042
3043 static void kmem_cache_destroy_work_func(struct work_struct *w); 3043 static void kmem_cache_destroy_work_func(struct work_struct *w);
3044 3044
3045 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3045 int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3046 { 3046 {
3047 struct memcg_cache_params *cur_params = s->memcg_params; 3047 struct memcg_cache_params *cur_params = s->memcg_params;
3048 3048
3049 VM_BUG_ON(!is_root_cache(s)); 3049 VM_BUG_ON(!is_root_cache(s));
3050 3050
3051 if (num_groups > memcg_limited_groups_array_size) { 3051 if (num_groups > memcg_limited_groups_array_size) {
3052 int i; 3052 int i;
3053 struct memcg_cache_params *new_params; 3053 struct memcg_cache_params *new_params;
3054 ssize_t size = memcg_caches_array_size(num_groups); 3054 ssize_t size = memcg_caches_array_size(num_groups);
3055 3055
3056 size *= sizeof(void *); 3056 size *= sizeof(void *);
3057 size += offsetof(struct memcg_cache_params, memcg_caches); 3057 size += offsetof(struct memcg_cache_params, memcg_caches);
3058 3058
3059 new_params = kzalloc(size, GFP_KERNEL); 3059 new_params = kzalloc(size, GFP_KERNEL);
3060 if (!new_params) 3060 if (!new_params)
3061 return -ENOMEM; 3061 return -ENOMEM;
3062 3062
3063 new_params->is_root_cache = true; 3063 new_params->is_root_cache = true;
3064 3064
3065 /* 3065 /*
3066 * There is a chance it will be bigger than 3066 * There is a chance it will be bigger than
3067 * memcg_limited_groups_array_size if we failed an allocation 3067 * memcg_limited_groups_array_size if we failed an allocation
3068 * in a cache, in which case all caches updated before it will 3068 * in a cache, in which case all caches updated before it will
3069 * have a bigger array. 3069 * have a bigger array.
3070 * 3070 *
3071 * But if that is the case, the data after 3071 * But if that is the case, the data after
3072 * memcg_limited_groups_array_size is certainly unused. 3072 * memcg_limited_groups_array_size is certainly unused.
3073 */ 3073 */
3074 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3074 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3075 if (!cur_params->memcg_caches[i]) 3075 if (!cur_params->memcg_caches[i])
3076 continue; 3076 continue;
3077 new_params->memcg_caches[i] = 3077 new_params->memcg_caches[i] =
3078 cur_params->memcg_caches[i]; 3078 cur_params->memcg_caches[i];
3079 } 3079 }
3080 3080
3081 /* 3081 /*
3082 * Ideally, we would wait until all caches succeed, and only 3082 * Ideally, we would wait until all caches succeed, and only
3083 * then free the old one. But this is not worth the extra 3083 * then free the old one. But this is not worth the extra
3084 * pointer per-cache we'd have to have for this. 3084 * pointer per-cache we'd have to have for this.
3085 * 3085 *
3086 * It is not a big deal if some caches are left with a size 3086 * It is not a big deal if some caches are left with a size
3087 * bigger than the others. And all updates will reset this 3087 * bigger than the others. And all updates will reset this
3088 * anyway. 3088 * anyway.
3089 */ 3089 */
3090 rcu_assign_pointer(s->memcg_params, new_params); 3090 rcu_assign_pointer(s->memcg_params, new_params);
3091 if (cur_params) 3091 if (cur_params)
3092 kfree_rcu(cur_params, rcu_head); 3092 kfree_rcu(cur_params, rcu_head);
3093 } 3093 }
3094 return 0; 3094 return 0;
3095 } 3095 }
3096 3096
3097 char *memcg_create_cache_name(struct mem_cgroup *memcg, 3097 char *memcg_create_cache_name(struct mem_cgroup *memcg,
3098 struct kmem_cache *root_cache) 3098 struct kmem_cache *root_cache)
3099 { 3099 {
3100 static char *buf = NULL; 3100 static char *buf = NULL;
3101 3101
3102 /* 3102 /*
3103 * We need a mutex here to protect the shared buffer. Since this is 3103 * We need a mutex here to protect the shared buffer. Since this is
3104 * expected to be called only on cache creation, we can employ the 3104 * expected to be called only on cache creation, we can employ the
3105 * slab_mutex for that purpose. 3105 * slab_mutex for that purpose.
3106 */ 3106 */
3107 lockdep_assert_held(&slab_mutex); 3107 lockdep_assert_held(&slab_mutex);
3108 3108
3109 if (!buf) { 3109 if (!buf) {
3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); 3110 buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
3111 if (!buf) 3111 if (!buf)
3112 return NULL; 3112 return NULL;
3113 } 3113 }
3114 3114
3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); 3115 cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1);
3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 3116 return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
3117 memcg_cache_id(memcg), buf); 3117 memcg_cache_id(memcg), buf);
3118 } 3118 }
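
The resulting cache name has the form "<root cache name>(<memcg kmem id>:<cgroup name>)". A trivial userspace illustration of that format; the concrete names and id below are made up.

#include <stdio.h>

int main(void)
{
        char name[128];

        snprintf(name, sizeof(name), "%s(%d:%s)", "kmalloc-64", 3, "workers");
        printf("%s\n", name);           /* -> kmalloc-64(3:workers) */
        return 0;
}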
3119 3119
3120 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, 3120 int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3121 struct kmem_cache *root_cache) 3121 struct kmem_cache *root_cache)
3122 { 3122 {
3123 size_t size; 3123 size_t size;
3124 3124
3125 if (!memcg_kmem_enabled()) 3125 if (!memcg_kmem_enabled())
3126 return 0; 3126 return 0;
3127 3127
3128 if (!memcg) { 3128 if (!memcg) {
3129 size = offsetof(struct memcg_cache_params, memcg_caches); 3129 size = offsetof(struct memcg_cache_params, memcg_caches);
3130 size += memcg_limited_groups_array_size * sizeof(void *); 3130 size += memcg_limited_groups_array_size * sizeof(void *);
3131 } else 3131 } else
3132 size = sizeof(struct memcg_cache_params); 3132 size = sizeof(struct memcg_cache_params);
3133 3133
3134 s->memcg_params = kzalloc(size, GFP_KERNEL); 3134 s->memcg_params = kzalloc(size, GFP_KERNEL);
3135 if (!s->memcg_params) 3135 if (!s->memcg_params)
3136 return -ENOMEM; 3136 return -ENOMEM;
3137 3137
3138 if (memcg) { 3138 if (memcg) {
3139 s->memcg_params->memcg = memcg; 3139 s->memcg_params->memcg = memcg;
3140 s->memcg_params->root_cache = root_cache; 3140 s->memcg_params->root_cache = root_cache;
3141 INIT_WORK(&s->memcg_params->destroy, 3141 INIT_WORK(&s->memcg_params->destroy,
3142 kmem_cache_destroy_work_func); 3142 kmem_cache_destroy_work_func);
3143 } else 3143 } else
3144 s->memcg_params->is_root_cache = true; 3144 s->memcg_params->is_root_cache = true;
3145 3145
3146 return 0; 3146 return 0;
3147 } 3147 }
3148 3148
3149 void memcg_free_cache_params(struct kmem_cache *s) 3149 void memcg_free_cache_params(struct kmem_cache *s)
3150 { 3150 {
3151 kfree(s->memcg_params); 3151 kfree(s->memcg_params);
3152 } 3152 }
3153 3153
3154 void memcg_register_cache(struct kmem_cache *s) 3154 void memcg_register_cache(struct kmem_cache *s)
3155 { 3155 {
3156 struct kmem_cache *root; 3156 struct kmem_cache *root;
3157 struct mem_cgroup *memcg; 3157 struct mem_cgroup *memcg;
3158 int id; 3158 int id;
3159 3159
3160 if (is_root_cache(s)) 3160 if (is_root_cache(s))
3161 return; 3161 return;
3162 3162
3163 /* 3163 /*
3164 * Holding the slab_mutex assures nobody will touch the memcg_caches 3164 * Holding the slab_mutex assures nobody will touch the memcg_caches
3165 * array while we are modifying it. 3165 * array while we are modifying it.
3166 */ 3166 */
3167 lockdep_assert_held(&slab_mutex); 3167 lockdep_assert_held(&slab_mutex);
3168 3168
3169 root = s->memcg_params->root_cache; 3169 root = s->memcg_params->root_cache;
3170 memcg = s->memcg_params->memcg; 3170 memcg = s->memcg_params->memcg;
3171 id = memcg_cache_id(memcg); 3171 id = memcg_cache_id(memcg);
3172 3172
3173 css_get(&memcg->css); 3173 css_get(&memcg->css);
3174 3174
3175 3175
3176 /* 3176 /*
3177 * Since readers won't lock (see cache_from_memcg_idx()), we need a 3177 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3178 * barrier here to ensure nobody will see the kmem_cache partially 3178 * barrier here to ensure nobody will see the kmem_cache partially
3179 * initialized. 3179 * initialized.
3180 */ 3180 */
3181 smp_wmb(); 3181 smp_wmb();
3182 3182
3183 /* 3183 /*
3184 * Initialize the pointer to this cache in its parent's memcg_params 3184 * Initialize the pointer to this cache in its parent's memcg_params
3185 * before adding it to the memcg_slab_caches list, otherwise we can 3185 * before adding it to the memcg_slab_caches list, otherwise we can
3186 * fail to convert memcg_params_to_cache() while traversing the list. 3186 * fail to convert memcg_params_to_cache() while traversing the list.
3187 */ 3187 */
3188 VM_BUG_ON(root->memcg_params->memcg_caches[id]); 3188 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3189 root->memcg_params->memcg_caches[id] = s; 3189 root->memcg_params->memcg_caches[id] = s;
3190 3190
3191 mutex_lock(&memcg->slab_caches_mutex); 3191 mutex_lock(&memcg->slab_caches_mutex);
3192 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); 3192 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3193 mutex_unlock(&memcg->slab_caches_mutex); 3193 mutex_unlock(&memcg->slab_caches_mutex);
3194 } 3194 }
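
memcg_register_cache() publishes the child cache for lockless readers: the cache is fully initialized, a write barrier is issued, and only then is the pointer stored in the root's memcg_caches array that cache_from_memcg_idx() scans. The sketch below models the same publish/lookup ordering with C11 release/acquire atomics; it is an analogy for illustration, not the kernel's smp_wmb()-based code, and all names are invented.

#include <stdatomic.h>
#include <stdio.h>

struct cache {
        int object_size;
};

#define NR_IDS 8
static _Atomic(struct cache *) caches[NR_IDS];

static void register_cache(int id, struct cache *c, int size)
{
        c->object_size = size;          /* fully initialize the object first... */
        atomic_store_explicit(&caches[id], c, memory_order_release);
}

static struct cache *lookup_cache(int id)
{
        /* ...so this acquire load sees a complete object or NULL */
        return atomic_load_explicit(&caches[id], memory_order_acquire);
}

int main(void)
{
        static struct cache c;

        register_cache(3, &c, 64);
        printf("size: %d\n", lookup_cache(3)->object_size);
        return 0;
}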
3195 3195
3196 void memcg_unregister_cache(struct kmem_cache *s) 3196 void memcg_unregister_cache(struct kmem_cache *s)
3197 { 3197 {
3198 struct kmem_cache *root; 3198 struct kmem_cache *root;
3199 struct mem_cgroup *memcg; 3199 struct mem_cgroup *memcg;
3200 int id; 3200 int id;
3201 3201
3202 if (is_root_cache(s)) 3202 if (is_root_cache(s))
3203 return; 3203 return;
3204 3204
3205 /* 3205 /*
3206 * Holding the slab_mutex assures nobody will touch the memcg_caches 3206 * Holding the slab_mutex assures nobody will touch the memcg_caches
3207 * array while we are modifying it. 3207 * array while we are modifying it.
3208 */ 3208 */
3209 lockdep_assert_held(&slab_mutex); 3209 lockdep_assert_held(&slab_mutex);
3210 3210
3211 root = s->memcg_params->root_cache; 3211 root = s->memcg_params->root_cache;
3212 memcg = s->memcg_params->memcg; 3212 memcg = s->memcg_params->memcg;
3213 id = memcg_cache_id(memcg); 3213 id = memcg_cache_id(memcg);
3214 3214
3215 mutex_lock(&memcg->slab_caches_mutex); 3215 mutex_lock(&memcg->slab_caches_mutex);
3216 list_del(&s->memcg_params->list); 3216 list_del(&s->memcg_params->list);
3217 mutex_unlock(&memcg->slab_caches_mutex); 3217 mutex_unlock(&memcg->slab_caches_mutex);
3218 3218
3219 /* 3219 /*
3220 * Clear the pointer to this cache in its parent's memcg_params only 3220 * Clear the pointer to this cache in its parent's memcg_params only
3221 * after removing it from the memcg_slab_caches list, otherwise we can 3221 * after removing it from the memcg_slab_caches list, otherwise we can
3222 * fail to convert memcg_params_to_cache() while traversing the list. 3222 * fail to convert memcg_params_to_cache() while traversing the list.
3223 */ 3223 */
3224 VM_BUG_ON(!root->memcg_params->memcg_caches[id]); 3224 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3225 root->memcg_params->memcg_caches[id] = NULL; 3225 root->memcg_params->memcg_caches[id] = NULL;
3226 3226
3227 css_put(&memcg->css); 3227 css_put(&memcg->css);
3228 } 3228 }
3229 3229
3230 /* 3230 /*
3231 * During the creation of a new cache, we need to disable our accounting mechanism 3231 * During the creation of a new cache, we need to disable our accounting mechanism
3232 * altogether. This is true even if we are not creating, but rather just 3232 * altogether. This is true even if we are not creating, but rather just
3233 * enqueueing new caches to be created. 3233 * enqueueing new caches to be created.
3234 * 3234 *
3235 * This is because that process will trigger allocations; some visible, like 3235 * This is because that process will trigger allocations; some visible, like
3236 * explicit kmallocs to auxiliary data structures, name strings and internal 3236 * explicit kmallocs to auxiliary data structures, name strings and internal
3237 * cache structures; some well concealed, like INIT_WORK() that can allocate 3237 * cache structures; some well concealed, like INIT_WORK() that can allocate
3238 * objects during debug. 3238 * objects during debug.
3239 * 3239 *
3240 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3240 * If any allocation happens during memcg_kmem_get_cache, we will recurse back
3241 * to it. This may not be a bounded recursion: since the first cache creation 3241 * to it. This may not be a bounded recursion: since the first cache creation
3242 * failed to complete (waiting on the allocation), we'll just try to create the 3242 * failed to complete (waiting on the allocation), we'll just try to create the
3243 * cache again, failing at the same point. 3243 * cache again, failing at the same point.
3244 * 3244 *
3245 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3245 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of
3246 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3246 * memcg_kmem_skip_account. So we enclose anything that might allocate memory
3247 * inside the following two functions. 3247 * inside the following two functions.
3248 */ 3248 */
3249 static inline void memcg_stop_kmem_account(void) 3249 static inline void memcg_stop_kmem_account(void)
3250 { 3250 {
3251 VM_BUG_ON(!current->mm); 3251 VM_BUG_ON(!current->mm);
3252 current->memcg_kmem_skip_account++; 3252 current->memcg_kmem_skip_account++;
3253 } 3253 }
3254 3254
3255 static inline void memcg_resume_kmem_account(void) 3255 static inline void memcg_resume_kmem_account(void)
3256 { 3256 {
3257 VM_BUG_ON(!current->mm); 3257 VM_BUG_ON(!current->mm);
3258 current->memcg_kmem_skip_account--; 3258 current->memcg_kmem_skip_account--;
3259 } 3259 }
3260 3260
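For reference, a minimal sketch of the wrapping pattern described above, assuming process context; example_unaccounted_alloc() is a hypothetical helper, while the real callers (such as memcg_create_cache_enqueue() further down) bracket their allocations in exactly this way:

static void example_unaccounted_alloc(void)
{
	void *buf;

	memcg_stop_kmem_account();
	buf = kmalloc(128, GFP_KERNEL);	/* not charged to the current memcg */
	memcg_resume_kmem_account();

	kfree(buf);
}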
3261 static void kmem_cache_destroy_work_func(struct work_struct *w) 3261 static void kmem_cache_destroy_work_func(struct work_struct *w)
3262 { 3262 {
3263 struct kmem_cache *cachep; 3263 struct kmem_cache *cachep;
3264 struct memcg_cache_params *p; 3264 struct memcg_cache_params *p;
3265 3265
3266 p = container_of(w, struct memcg_cache_params, destroy); 3266 p = container_of(w, struct memcg_cache_params, destroy);
3267 3267
3268 cachep = memcg_params_to_cache(p); 3268 cachep = memcg_params_to_cache(p);
3269 3269
3270 /* 3270 /*
3271 * If we get down to 0 after shrink, we could delete right away. 3271 * If we get down to 0 after shrink, we could delete right away.
3272 * However, memcg_release_pages() already puts us back in the workqueue 3272 * However, memcg_release_pages() already puts us back in the workqueue
3273 * in that case. If we proceed deleting, we'll get a dangling 3273 * in that case. If we proceed deleting, we'll get a dangling
3274 * reference, and removing the object from the workqueue in that case 3274 * reference, and removing the object from the workqueue in that case
3275 * is an unnecessary complication. We are not a fast path. 3275 * is an unnecessary complication. We are not a fast path.
3276 * 3276 *
3277 * Note that this case is fundamentally different from racing with 3277 * Note that this case is fundamentally different from racing with
3278 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in 3278 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3279 * kmem_cache_shrink, not only we would be reinserting a dead cache 3279 * kmem_cache_shrink, not only we would be reinserting a dead cache
3280 * into the queue, but doing so from inside the worker racing to 3280 * into the queue, but doing so from inside the worker racing to
3281 * destroy it. 3281 * destroy it.
3282 * 3282 *
3283 * So if we aren't down to zero, we'll just schedule a worker and try 3283 * So if we aren't down to zero, we'll just schedule a worker and try
3284 * again 3284 * again
3285 */ 3285 */
3286 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) 3286 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3287 kmem_cache_shrink(cachep); 3287 kmem_cache_shrink(cachep);
3288 else 3288 else
3289 kmem_cache_destroy(cachep); 3289 kmem_cache_destroy(cachep);
3290 } 3290 }
3291 3291
3292 void mem_cgroup_destroy_cache(struct kmem_cache *cachep) 3292 void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3293 { 3293 {
3294 if (!cachep->memcg_params->dead) 3294 if (!cachep->memcg_params->dead)
3295 return; 3295 return;
3296 3296
3297 /* 3297 /*
3298 * There are many ways in which we can get here. 3298 * There are many ways in which we can get here.
3299 * 3299 *
3300 * We can get to a memory-pressure situation while the delayed work is 3300 * We can get to a memory-pressure situation while the delayed work is
3301 * still pending to run. The vmscan shrinkers can then release all 3301 * still pending to run. The vmscan shrinkers can then release all
3302 * cache memory and get us to destruction. If this is the case, we'll 3302 * cache memory and get us to destruction. If this is the case, we'll
3303 * be executed twice, which is a bug (the second time will execute over 3303 * be executed twice, which is a bug (the second time will execute over
3304 * bogus data). In this case, cancelling the work should be fine. 3304 * bogus data). In this case, cancelling the work should be fine.
3305 * 3305 *
3306 * But we can also get here from the worker itself, if 3306 * But we can also get here from the worker itself, if
3307 * kmem_cache_shrink is enough to shake all the remaining objects and 3307 * kmem_cache_shrink is enough to shake all the remaining objects and
3308 * get the page count to 0. In this case, we'll deadlock if we try to 3308 * get the page count to 0. In this case, we'll deadlock if we try to
3309 * cancel the work (the worker runs with an internal lock held, which 3309 * cancel the work (the worker runs with an internal lock held, which
3310 * is the same lock we would hold for cancel_work_sync().) 3310 * is the same lock we would hold for cancel_work_sync().)
3311 * 3311 *
3312 * Since we can't possibly know who got us here, just refrain from 3312 * Since we can't possibly know who got us here, just refrain from
3313 * running if there is already work pending 3313 * running if there is already work pending
3314 */ 3314 */
3315 if (work_pending(&cachep->memcg_params->destroy)) 3315 if (work_pending(&cachep->memcg_params->destroy))
3316 return; 3316 return;
3317 /* 3317 /*
3318 * We have to defer the actual destroying to a workqueue, because 3318 * We have to defer the actual destroying to a workqueue, because
3319 * we might currently be in a context that cannot sleep. 3319 * we might currently be in a context that cannot sleep.
3320 */ 3320 */
3321 schedule_work(&cachep->memcg_params->destroy); 3321 schedule_work(&cachep->memcg_params->destroy);
3322 } 3322 }
3323 3323
3324 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3324 void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3325 { 3325 {
3326 struct kmem_cache *c; 3326 struct kmem_cache *c;
3327 int i; 3327 int i;
3328 3328
3329 if (!s->memcg_params) 3329 if (!s->memcg_params)
3330 return; 3330 return;
3331 if (!s->memcg_params->is_root_cache) 3331 if (!s->memcg_params->is_root_cache)
3332 return; 3332 return;
3333 3333
3334 /* 3334 /*
3335 * If the cache is being destroyed, we trust that there is no one else 3335 * If the cache is being destroyed, we trust that there is no one else
3336 * requesting objects from it. Even if there are, the sanity checks in 3336 * requesting objects from it. Even if there are, the sanity checks in
3337 * kmem_cache_destroy should catch this ill case. 3337 * kmem_cache_destroy should catch this ill case.
3338 * 3338 *
3339 * Still, we don't want anyone else freeing memcg_caches under our 3339 * Still, we don't want anyone else freeing memcg_caches under our
3340 * noses, which can happen if a new memcg comes to life. As usual, 3340 * noses, which can happen if a new memcg comes to life. As usual,
3341 * we'll take the activate_kmem_mutex to protect ourselves against 3341 * we'll take the activate_kmem_mutex to protect ourselves against
3342 * this. 3342 * this.
3343 */ 3343 */
3344 mutex_lock(&activate_kmem_mutex); 3344 mutex_lock(&activate_kmem_mutex);
3345 for_each_memcg_cache_index(i) { 3345 for_each_memcg_cache_index(i) {
3346 c = cache_from_memcg_idx(s, i); 3346 c = cache_from_memcg_idx(s, i);
3347 if (!c) 3347 if (!c)
3348 continue; 3348 continue;
3349 3349
3350 /* 3350 /*
3351 * We will now manually delete the caches, so to avoid races 3351 * We will now manually delete the caches, so to avoid races
3352 * we need to cancel all pending destruction workers and 3352 * we need to cancel all pending destruction workers and
3353 * proceed with destruction ourselves. 3353 * proceed with destruction ourselves.
3354 * 3354 *
3355 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3355 * kmem_cache_destroy() will call kmem_cache_shrink internally,
3356 * and that could spawn the workers again: it is likely that 3356 * and that could spawn the workers again: it is likely that
3357 * the cache still has active pages until this very moment. 3357 * the cache still has active pages until this very moment.
3358 * This would lead us back to mem_cgroup_destroy_cache. 3358 * This would lead us back to mem_cgroup_destroy_cache.
3359 * 3359 *
3360 * But that will not execute at all if the "dead" flag is not 3360 * But that will not execute at all if the "dead" flag is not
3361 * set, so flip it down to guarantee we are in control. 3361 * set, so flip it down to guarantee we are in control.
3362 */ 3362 */
3363 c->memcg_params->dead = false; 3363 c->memcg_params->dead = false;
3364 cancel_work_sync(&c->memcg_params->destroy); 3364 cancel_work_sync(&c->memcg_params->destroy);
3365 kmem_cache_destroy(c); 3365 kmem_cache_destroy(c);
3366 } 3366 }
3367 mutex_unlock(&activate_kmem_mutex); 3367 mutex_unlock(&activate_kmem_mutex);
3368 } 3368 }
3369 3369
3370 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3370 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3371 { 3371 {
3372 struct kmem_cache *cachep; 3372 struct kmem_cache *cachep;
3373 struct memcg_cache_params *params; 3373 struct memcg_cache_params *params;
3374 3374
3375 if (!memcg_kmem_is_active(memcg)) 3375 if (!memcg_kmem_is_active(memcg))
3376 return; 3376 return;
3377 3377
3378 mutex_lock(&memcg->slab_caches_mutex); 3378 mutex_lock(&memcg->slab_caches_mutex);
3379 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3379 list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
3380 cachep = memcg_params_to_cache(params); 3380 cachep = memcg_params_to_cache(params);
3381 cachep->memcg_params->dead = true; 3381 cachep->memcg_params->dead = true;
3382 schedule_work(&cachep->memcg_params->destroy); 3382 schedule_work(&cachep->memcg_params->destroy);
3383 } 3383 }
3384 mutex_unlock(&memcg->slab_caches_mutex); 3384 mutex_unlock(&memcg->slab_caches_mutex);
3385 } 3385 }
3386 3386
3387 struct create_work { 3387 struct create_work {
3388 struct mem_cgroup *memcg; 3388 struct mem_cgroup *memcg;
3389 struct kmem_cache *cachep; 3389 struct kmem_cache *cachep;
3390 struct work_struct work; 3390 struct work_struct work;
3391 }; 3391 };
3392 3392
3393 static void memcg_create_cache_work_func(struct work_struct *w) 3393 static void memcg_create_cache_work_func(struct work_struct *w)
3394 { 3394 {
3395 struct create_work *cw = container_of(w, struct create_work, work); 3395 struct create_work *cw = container_of(w, struct create_work, work);
3396 struct mem_cgroup *memcg = cw->memcg; 3396 struct mem_cgroup *memcg = cw->memcg;
3397 struct kmem_cache *cachep = cw->cachep; 3397 struct kmem_cache *cachep = cw->cachep;
3398 struct kmem_cache *new;
3399 3398
3400 new = kmem_cache_create_memcg(memcg, cachep->name, 3399 kmem_cache_create_memcg(memcg, cachep);
3401 cachep->object_size, cachep->align,
3402 cachep->flags & ~SLAB_PANIC, cachep->ctor, cachep);
3403 if (new)
3404 new->allocflags |= __GFP_KMEMCG;
3405 css_put(&memcg->css); 3400 css_put(&memcg->css);
3406 kfree(cw); 3401 kfree(cw);
3407 } 3402 }
3408 3403
3409 /* 3404 /*
3410 * Enqueue the creation of a per-memcg kmem_cache. 3405 * Enqueue the creation of a per-memcg kmem_cache.
3411 */ 3406 */
3412 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3407 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3413 struct kmem_cache *cachep) 3408 struct kmem_cache *cachep)
3414 { 3409 {
3415 struct create_work *cw; 3410 struct create_work *cw;
3416 3411
3417 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3412 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
3418 if (cw == NULL) { 3413 if (cw == NULL) {
3419 css_put(&memcg->css); 3414 css_put(&memcg->css);
3420 return; 3415 return;
3421 } 3416 }
3422 3417
3423 cw->memcg = memcg; 3418 cw->memcg = memcg;
3424 cw->cachep = cachep; 3419 cw->cachep = cachep;
3425 3420
3426 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3421 INIT_WORK(&cw->work, memcg_create_cache_work_func);
3427 schedule_work(&cw->work); 3422 schedule_work(&cw->work);
3428 } 3423 }
3429 3424
3430 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3425 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
3431 struct kmem_cache *cachep) 3426 struct kmem_cache *cachep)
3432 { 3427 {
3433 /* 3428 /*
3434 * We need to stop accounting when we kmalloc, because if the 3429 * We need to stop accounting when we kmalloc, because if the
3435 * corresponding kmalloc cache is not yet created, the first allocation 3430 * corresponding kmalloc cache is not yet created, the first allocation
3436 * in __memcg_create_cache_enqueue will recurse. 3431 * in __memcg_create_cache_enqueue will recurse.
3437 * 3432 *
3438 * However, it is better to enclose the whole function. Depending on 3433 * However, it is better to enclose the whole function. Depending on
3439 * the debugging options enabled, INIT_WORK(), for instance, can 3434 * the debugging options enabled, INIT_WORK(), for instance, can
3440 * trigger an allocation. This too, will make us recurse. Because at 3435 * trigger an allocation. This too, will make us recurse. Because at
3441 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3436 * this point we can't allow ourselves back into memcg_kmem_get_cache,
3442 * the safest choice is to do it like this, wrapping the whole function. 3437 * the safest choice is to do it like this, wrapping the whole function.
3443 */ 3438 */
3444 memcg_stop_kmem_account(); 3439 memcg_stop_kmem_account();
3445 __memcg_create_cache_enqueue(memcg, cachep); 3440 __memcg_create_cache_enqueue(memcg, cachep);
3446 memcg_resume_kmem_account(); 3441 memcg_resume_kmem_account();
3447 } 3442 }
3448 /* 3443 /*
3449 * Return the kmem_cache we're supposed to use for a slab allocation. 3444 * Return the kmem_cache we're supposed to use for a slab allocation.
3450 * We try to use the current memcg's version of the cache. 3445 * We try to use the current memcg's version of the cache.
3451 * 3446 *
3452 * If the cache does not exist yet, i.e. we are the first user of it, 3447 * If the cache does not exist yet, i.e. we are the first user of it,
3453 * we either create it immediately, if possible, or create it asynchronously 3448 * we either create it immediately, if possible, or create it asynchronously
3454 * in a workqueue. 3449 * in a workqueue.
3455 * In the latter case, we will let the current allocation go through with 3450 * In the latter case, we will let the current allocation go through with
3456 * the original cache. 3451 * the original cache.
3457 * 3452 *
3458 * Can't be called in interrupt context or from kernel threads. 3453 * Can't be called in interrupt context or from kernel threads.
3459 * This function needs to be called with rcu_read_lock() held. 3454 * This function needs to be called with rcu_read_lock() held.
3460 */ 3455 */
3461 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3456 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3462 gfp_t gfp) 3457 gfp_t gfp)
3463 { 3458 {
3464 struct mem_cgroup *memcg; 3459 struct mem_cgroup *memcg;
3465 struct kmem_cache *memcg_cachep; 3460 struct kmem_cache *memcg_cachep;
3466 3461
3467 VM_BUG_ON(!cachep->memcg_params); 3462 VM_BUG_ON(!cachep->memcg_params);
3468 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3463 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
3469 3464
3470 if (!current->mm || current->memcg_kmem_skip_account) 3465 if (!current->mm || current->memcg_kmem_skip_account)
3471 return cachep; 3466 return cachep;
3472 3467
3473 rcu_read_lock(); 3468 rcu_read_lock();
3474 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3469 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
3475 3470
3476 if (!memcg_can_account_kmem(memcg)) 3471 if (!memcg_can_account_kmem(memcg))
3477 goto out; 3472 goto out;
3478 3473
3479 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 3474 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3480 if (likely(memcg_cachep)) { 3475 if (likely(memcg_cachep)) {
3481 cachep = memcg_cachep; 3476 cachep = memcg_cachep;
3482 goto out; 3477 goto out;
3483 } 3478 }
3484 3479
3485 /* The corresponding put will be done in the workqueue. */ 3480 /* The corresponding put will be done in the workqueue. */
3486 if (!css_tryget(&memcg->css)) 3481 if (!css_tryget(&memcg->css))
3487 goto out; 3482 goto out;
3488 rcu_read_unlock(); 3483 rcu_read_unlock();
3489 3484
3490 /* 3485 /*
3491 * If we are in a safe context (can wait, and not in interrupt 3486 * If we are in a safe context (can wait, and not in interrupt
3492 * context), we could be predictable and return right away. 3487 * context), we could be predictable and return right away.
3493 * This would guarantee that the allocation being performed 3488 * This would guarantee that the allocation being performed
3494 * already belongs in the new cache. 3489 * already belongs in the new cache.
3495 * 3490 *
3496 * However, there are some clashes that can arise from locking. 3491 * However, there are some clashes that can arise from locking.
3497 * For instance, because we acquire the slab_mutex while doing 3492 * For instance, because we acquire the slab_mutex while doing
3498 * kmem_cache_dup, this means no further allocation could happen 3493 * kmem_cache_dup, this means no further allocation could happen
3499 * with the slab_mutex held. 3494 * with the slab_mutex held.
3500 * 3495 *
3501 * Also, because cache creation issues get_online_cpus(), this 3496 * Also, because cache creation issues get_online_cpus(), this
3502 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3497 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
3503 * that ends up reversed during cpu hotplug. (cpuset allocates 3498 * that ends up reversed during cpu hotplug. (cpuset allocates
3504 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, 3499 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
3505 * better to defer everything. 3500 * better to defer everything.
3506 */ 3501 */
3507 memcg_create_cache_enqueue(memcg, cachep); 3502 memcg_create_cache_enqueue(memcg, cachep);
3508 return cachep; 3503 return cachep;
3509 out: 3504 out:
3510 rcu_read_unlock(); 3505 rcu_read_unlock();
3511 return cachep; 3506 return cachep;
3512 } 3507 }
3513 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3508 EXPORT_SYMBOL(__memcg_kmem_get_cache);
3514 3509
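As a hedged illustration of the intended use, a hypothetical allocation helper; memcg_kmem_get_cache() is the inline wrapper that guards this function, and example_alloc() is made up for illustration:

static void *example_alloc(struct kmem_cache *s, gfp_t gfp)
{
	/* may substitute the current memcg's child cache for the root cache */
	s = memcg_kmem_get_cache(s, gfp);
	return kmem_cache_alloc(s, gfp);
}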
3515 /* 3510 /*
3516 * We need to verify if the allocation against current->mm->owner's memcg is 3511 * We need to verify if the allocation against current->mm->owner's memcg is
3517 * possible for the given order. But the page is not allocated yet, so we'll 3512 * possible for the given order. But the page is not allocated yet, so we'll
3518 * need a further commit step to do the final arrangements. 3513 * need a further commit step to do the final arrangements.
3519 * 3514 *
3520 * It is possible for the task to switch cgroups in the meantime, so at 3515 * It is possible for the task to switch cgroups in the meantime, so at
3521 * commit time, we can't rely on task conversion any longer. We'll then use 3516 * commit time, we can't rely on task conversion any longer. We'll then use
3522 * the handle argument to return to the caller which cgroup we should commit 3517 * the handle argument to return to the caller which cgroup we should commit
3523 * against. We could also return the memcg directly and avoid the pointer 3518 * against. We could also return the memcg directly and avoid the pointer
3524 * passing, but a boolean return value gives better semantics considering 3519 * passing, but a boolean return value gives better semantics considering
3525 * the compiled-out case as well. 3520 * the compiled-out case as well.
3526 * 3521 *
3527 * Returning true means the allocation is possible. 3522 * Returning true means the allocation is possible.
3528 */ 3523 */
3529 bool 3524 bool
3530 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3525 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
3531 { 3526 {
3532 struct mem_cgroup *memcg; 3527 struct mem_cgroup *memcg;
3533 int ret; 3528 int ret;
3534 3529
3535 *_memcg = NULL; 3530 *_memcg = NULL;
3536 3531
3537 /* 3532 /*
3538 * Disabling accounting is only relevant for some specific memcg 3533 * Disabling accounting is only relevant for some specific memcg
3539 * internal allocations. Therefore we would initially not have such a 3534 * internal allocations. Therefore we would initially not have such a
3540 * check here, since direct calls to the page allocator that are marked 3535 * check here, since direct calls to the page allocator that are marked
3541 * with GFP_KMEMCG only happen outside memcg core. We are mostly 3536 * with GFP_KMEMCG only happen outside memcg core. We are mostly
3542 * concerned with cache allocations, and by having this test at 3537 * concerned with cache allocations, and by having this test at
3543 * memcg_kmem_get_cache, we are already able to relay the allocation to 3538 * memcg_kmem_get_cache, we are already able to relay the allocation to
3544 * the root cache and bypass the memcg cache altogether. 3539 * the root cache and bypass the memcg cache altogether.
3545 * 3540 *
3546 * There is one exception, though: the SLUB allocator does not create 3541 * There is one exception, though: the SLUB allocator does not create
3547 * large order caches, but rather services large kmallocs directly from 3542 * large order caches, but rather services large kmallocs directly from
3548 * the page allocator. Therefore, the following sequence when backed by 3543 * the page allocator. Therefore, the following sequence when backed by
3549 * the SLUB allocator: 3544 * the SLUB allocator:
3550 * 3545 *
3551 * memcg_stop_kmem_account(); 3546 * memcg_stop_kmem_account();
3552 * kmalloc(<large_number>) 3547 * kmalloc(<large_number>)
3553 * memcg_resume_kmem_account(); 3548 * memcg_resume_kmem_account();
3554 * 3549 *
3555 * would effectively ignore the fact that we should skip accounting, 3550 * would effectively ignore the fact that we should skip accounting,
3556 * since it will drive us directly to this function without passing 3551 * since it will drive us directly to this function without passing
3557 * through the cache selector memcg_kmem_get_cache. Such large 3552 * through the cache selector memcg_kmem_get_cache. Such large
3558 * allocations are extremely rare but can happen, for instance, for the 3553 * allocations are extremely rare but can happen, for instance, for the
3559 * cache arrays. We bring this test here. 3554 * cache arrays. We bring this test here.
3560 */ 3555 */
3561 if (!current->mm || current->memcg_kmem_skip_account) 3556 if (!current->mm || current->memcg_kmem_skip_account)
3562 return true; 3557 return true;
3563 3558
3564 memcg = get_mem_cgroup_from_mm(current->mm); 3559 memcg = get_mem_cgroup_from_mm(current->mm);
3565 3560
3566 if (!memcg_can_account_kmem(memcg)) { 3561 if (!memcg_can_account_kmem(memcg)) {
3567 css_put(&memcg->css); 3562 css_put(&memcg->css);
3568 return true; 3563 return true;
3569 } 3564 }
3570 3565
3571 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3566 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order);
3572 if (!ret) 3567 if (!ret)
3573 *_memcg = memcg; 3568 *_memcg = memcg;
3574 3569
3575 css_put(&memcg->css); 3570 css_put(&memcg->css);
3576 return (ret == 0); 3571 return (ret == 0);
3577 } 3572 }
3578 3573
3579 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3574 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
3580 int order) 3575 int order)
3581 { 3576 {
3582 struct page_cgroup *pc; 3577 struct page_cgroup *pc;
3583 3578
3584 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3579 VM_BUG_ON(mem_cgroup_is_root(memcg));
3585 3580
3586 /* The page allocation failed. Revert */ 3581 /* The page allocation failed. Revert */
3587 if (!page) { 3582 if (!page) {
3588 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3583 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3589 return; 3584 return;
3590 } 3585 }
3591 3586
3592 pc = lookup_page_cgroup(page); 3587 pc = lookup_page_cgroup(page);
3593 lock_page_cgroup(pc); 3588 lock_page_cgroup(pc);
3594 pc->mem_cgroup = memcg; 3589 pc->mem_cgroup = memcg;
3595 SetPageCgroupUsed(pc); 3590 SetPageCgroupUsed(pc);
3596 unlock_page_cgroup(pc); 3591 unlock_page_cgroup(pc);
3597 } 3592 }
3598 3593
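A hedged sketch of the charge-then-commit protocol described in the comment above __memcg_kmem_newpage_charge(), loosely modeled on the page allocator's use of the memcg_kmem_newpage_charge()/memcg_kmem_commit_charge() wrappers; example_charged_alloc() is hypothetical, and the matching free path would call memcg_kmem_uncharge_pages(), defined next:

static struct page *example_charged_alloc(gfp_t gfp, int order)
{
	struct mem_cgroup *memcg = NULL;
	struct page *page;

	if (!memcg_kmem_newpage_charge(gfp, &memcg, order))
		return NULL;	/* charge refused: fail the allocation */

	page = alloc_pages(gfp, order);

	/* commits against the recorded memcg, or reverts if page is NULL */
	memcg_kmem_commit_charge(page, memcg, order);
	return page;
}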
3599 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3594 void __memcg_kmem_uncharge_pages(struct page *page, int order)
3600 { 3595 {
3601 struct mem_cgroup *memcg = NULL; 3596 struct mem_cgroup *memcg = NULL;
3602 struct page_cgroup *pc; 3597 struct page_cgroup *pc;
3603 3598
3604 3599
3605 pc = lookup_page_cgroup(page); 3600 pc = lookup_page_cgroup(page);
3606 /* 3601 /*
3607 * Fast unlocked return. Theoretically might have changed, have to 3602 * Fast unlocked return. Theoretically might have changed, have to
3608 * check again after locking. 3603 * check again after locking.
3609 */ 3604 */
3610 if (!PageCgroupUsed(pc)) 3605 if (!PageCgroupUsed(pc))
3611 return; 3606 return;
3612 3607
3613 lock_page_cgroup(pc); 3608 lock_page_cgroup(pc);
3614 if (PageCgroupUsed(pc)) { 3609 if (PageCgroupUsed(pc)) {
3615 memcg = pc->mem_cgroup; 3610 memcg = pc->mem_cgroup;
3616 ClearPageCgroupUsed(pc); 3611 ClearPageCgroupUsed(pc);
3617 } 3612 }
3618 unlock_page_cgroup(pc); 3613 unlock_page_cgroup(pc);
3619 3614
3620 /* 3615 /*
3621 * We trust that only if there is a memcg associated with the page, it 3616 * We trust that only if there is a memcg associated with the page, it
3622 * is a valid allocation 3617 * is a valid allocation
3623 */ 3618 */
3624 if (!memcg) 3619 if (!memcg)
3625 return; 3620 return;
3626 3621
3627 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 3622 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3628 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3623 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3629 } 3624 }
3630 #else 3625 #else
3631 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3626 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
3632 { 3627 {
3633 } 3628 }
3634 #endif /* CONFIG_MEMCG_KMEM */ 3629 #endif /* CONFIG_MEMCG_KMEM */
3635 3630
3636 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3631 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
3637 3632
3638 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3633 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
3639 /* 3634 /*
3640 * Because tail pages are not marked as "used", set it. We're under 3635 * Because tail pages are not marked as "used", set it. We're under
3641 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3636 * zone->lru_lock, 'splitting on pmd' and compound_lock.
3642 * charge/uncharge will never happen and move_account() is done under 3637 * charge/uncharge will never happen and move_account() is done under
3643 * compound_lock(), so we don't have to take care of races. 3638 * compound_lock(), so we don't have to take care of races.
3644 */ 3639 */
3645 void mem_cgroup_split_huge_fixup(struct page *head) 3640 void mem_cgroup_split_huge_fixup(struct page *head)
3646 { 3641 {
3647 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3642 struct page_cgroup *head_pc = lookup_page_cgroup(head);
3648 struct page_cgroup *pc; 3643 struct page_cgroup *pc;
3649 struct mem_cgroup *memcg; 3644 struct mem_cgroup *memcg;
3650 int i; 3645 int i;
3651 3646
3652 if (mem_cgroup_disabled()) 3647 if (mem_cgroup_disabled())
3653 return; 3648 return;
3654 3649
3655 memcg = head_pc->mem_cgroup; 3650 memcg = head_pc->mem_cgroup;
3656 for (i = 1; i < HPAGE_PMD_NR; i++) { 3651 for (i = 1; i < HPAGE_PMD_NR; i++) {
3657 pc = head_pc + i; 3652 pc = head_pc + i;
3658 pc->mem_cgroup = memcg; 3653 pc->mem_cgroup = memcg;
3659 smp_wmb();/* see __commit_charge() */ 3654 smp_wmb();/* see __commit_charge() */
3660 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3655 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
3661 } 3656 }
3662 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3657 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3663 HPAGE_PMD_NR); 3658 HPAGE_PMD_NR);
3664 } 3659 }
3665 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3660 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
3666 3661
3667 /** 3662 /**
3668 * mem_cgroup_move_account - move account of the page 3663 * mem_cgroup_move_account - move account of the page
3669 * @page: the page 3664 * @page: the page
3670 * @nr_pages: number of regular pages (>1 for huge pages) 3665 * @nr_pages: number of regular pages (>1 for huge pages)
3671 * @pc: page_cgroup of the page. 3666 * @pc: page_cgroup of the page.
3672 * @from: mem_cgroup which the page is moved from. 3667 * @from: mem_cgroup which the page is moved from.
3673 * @to: mem_cgroup which the page is moved to. @from != @to. 3668 * @to: mem_cgroup which the page is moved to. @from != @to.
3674 * 3669 *
3675 * The caller must confirm the following. 3670 * The caller must confirm the following.
3676 * - page is not on LRU (isolate_page() is useful.) 3671 * - page is not on LRU (isolate_page() is useful.)
3677 * - compound_lock is held when nr_pages > 1 3672 * - compound_lock is held when nr_pages > 1
3678 * 3673 *
3679 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3674 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
3680 * from old cgroup. 3675 * from old cgroup.
3681 */ 3676 */
3682 static int mem_cgroup_move_account(struct page *page, 3677 static int mem_cgroup_move_account(struct page *page,
3683 unsigned int nr_pages, 3678 unsigned int nr_pages,
3684 struct page_cgroup *pc, 3679 struct page_cgroup *pc,
3685 struct mem_cgroup *from, 3680 struct mem_cgroup *from,
3686 struct mem_cgroup *to) 3681 struct mem_cgroup *to)
3687 { 3682 {
3688 unsigned long flags; 3683 unsigned long flags;
3689 int ret; 3684 int ret;
3690 bool anon = PageAnon(page); 3685 bool anon = PageAnon(page);
3691 3686
3692 VM_BUG_ON(from == to); 3687 VM_BUG_ON(from == to);
3693 VM_BUG_ON_PAGE(PageLRU(page), page); 3688 VM_BUG_ON_PAGE(PageLRU(page), page);
3694 /* 3689 /*
3695 * The page is isolated from LRU. So, collapse function 3690 * The page is isolated from LRU. So, collapse function
3696 * will not handle this page. But page splitting can happen. 3691 * will not handle this page. But page splitting can happen.
3697 * Do this check under compound_page_lock(). The caller should 3692 * Do this check under compound_page_lock(). The caller should
3698 * hold it. 3693 * hold it.
3699 */ 3694 */
3700 ret = -EBUSY; 3695 ret = -EBUSY;
3701 if (nr_pages > 1 && !PageTransHuge(page)) 3696 if (nr_pages > 1 && !PageTransHuge(page))
3702 goto out; 3697 goto out;
3703 3698
3704 lock_page_cgroup(pc); 3699 lock_page_cgroup(pc);
3705 3700
3706 ret = -EINVAL; 3701 ret = -EINVAL;
3707 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3702 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
3708 goto unlock; 3703 goto unlock;
3709 3704
3710 move_lock_mem_cgroup(from, &flags); 3705 move_lock_mem_cgroup(from, &flags);
3711 3706
3712 if (!anon && page_mapped(page)) { 3707 if (!anon && page_mapped(page)) {
3713 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3708 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3714 nr_pages); 3709 nr_pages);
3715 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], 3710 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
3716 nr_pages); 3711 nr_pages);
3717 } 3712 }
3718 3713
3719 if (PageWriteback(page)) { 3714 if (PageWriteback(page)) {
3720 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3715 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3721 nr_pages); 3716 nr_pages);
3722 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], 3717 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
3723 nr_pages); 3718 nr_pages);
3724 } 3719 }
3725 3720
3726 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3721 mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
3727 3722
3728 /* caller should have done css_get */ 3723 /* caller should have done css_get */
3729 pc->mem_cgroup = to; 3724 pc->mem_cgroup = to;
3730 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3725 mem_cgroup_charge_statistics(to, page, anon, nr_pages);
3731 move_unlock_mem_cgroup(from, &flags); 3726 move_unlock_mem_cgroup(from, &flags);
3732 ret = 0; 3727 ret = 0;
3733 unlock: 3728 unlock:
3734 unlock_page_cgroup(pc); 3729 unlock_page_cgroup(pc);
3735 /* 3730 /*
3736 * check events 3731 * check events
3737 */ 3732 */
3738 memcg_check_events(to, page); 3733 memcg_check_events(to, page);
3739 memcg_check_events(from, page); 3734 memcg_check_events(from, page);
3740 out: 3735 out:
3741 return ret; 3736 return ret;
3742 } 3737 }
3743 3738
3744 /** 3739 /**
3745 * mem_cgroup_move_parent - moves page to the parent group 3740 * mem_cgroup_move_parent - moves page to the parent group
3746 * @page: the page to move 3741 * @page: the page to move
3747 * @pc: page_cgroup of the page 3742 * @pc: page_cgroup of the page
3748 * @child: page's cgroup 3743 * @child: page's cgroup
3749 * 3744 *
3750 * move charges to its parent or the root cgroup if the group has no 3745 * move charges to its parent or the root cgroup if the group has no
3751 * parent (aka use_hierarchy==0). 3746 * parent (aka use_hierarchy==0).
3752 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3747 * Although this might fail (get_page_unless_zero, isolate_lru_page or
3753 * mem_cgroup_move_account fails) the failure is always temporary and 3748 * mem_cgroup_move_account fails) the failure is always temporary and
3754 * it signals a race with a page removal/uncharge or migration. In the 3749 * it signals a race with a page removal/uncharge or migration. In the
3755 * first case the page is on the way out and it will vanish from the LRU 3750 * first case the page is on the way out and it will vanish from the LRU
3756 * on the next attempt and the call should be retried later. 3751 * on the next attempt and the call should be retried later.
3757 * Isolation from the LRU fails only if page has been isolated from 3752 * Isolation from the LRU fails only if page has been isolated from
3758 * the LRU since we looked at it and that usually means either global 3753 * the LRU since we looked at it and that usually means either global
3759 * reclaim or migration going on. The page will either get back to the 3754 * reclaim or migration going on. The page will either get back to the
3760 * LRU or vanish. 3755 * LRU or vanish.
3761 * Finally mem_cgroup_move_account fails only if the page got uncharged 3756 * Finally mem_cgroup_move_account fails only if the page got uncharged
3762 * (!PageCgroupUsed) or moved to a different group. The page will 3757 * (!PageCgroupUsed) or moved to a different group. The page will
3763 * disappear in the next attempt. 3758 * disappear in the next attempt.
3764 */ 3759 */
3765 static int mem_cgroup_move_parent(struct page *page, 3760 static int mem_cgroup_move_parent(struct page *page,
3766 struct page_cgroup *pc, 3761 struct page_cgroup *pc,
3767 struct mem_cgroup *child) 3762 struct mem_cgroup *child)
3768 { 3763 {
3769 struct mem_cgroup *parent; 3764 struct mem_cgroup *parent;
3770 unsigned int nr_pages; 3765 unsigned int nr_pages;
3771 unsigned long uninitialized_var(flags); 3766 unsigned long uninitialized_var(flags);
3772 int ret; 3767 int ret;
3773 3768
3774 VM_BUG_ON(mem_cgroup_is_root(child)); 3769 VM_BUG_ON(mem_cgroup_is_root(child));
3775 3770
3776 ret = -EBUSY; 3771 ret = -EBUSY;
3777 if (!get_page_unless_zero(page)) 3772 if (!get_page_unless_zero(page))
3778 goto out; 3773 goto out;
3779 if (isolate_lru_page(page)) 3774 if (isolate_lru_page(page))
3780 goto put; 3775 goto put;
3781 3776
3782 nr_pages = hpage_nr_pages(page); 3777 nr_pages = hpage_nr_pages(page);
3783 3778
3784 parent = parent_mem_cgroup(child); 3779 parent = parent_mem_cgroup(child);
3785 /* 3780 /*
3786 * If no parent, move charges to root cgroup. 3781 * If no parent, move charges to root cgroup.
3787 */ 3782 */
3788 if (!parent) 3783 if (!parent)
3789 parent = root_mem_cgroup; 3784 parent = root_mem_cgroup;
3790 3785
3791 if (nr_pages > 1) { 3786 if (nr_pages > 1) {
3792 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3787 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3793 flags = compound_lock_irqsave(page); 3788 flags = compound_lock_irqsave(page);
3794 } 3789 }
3795 3790
3796 ret = mem_cgroup_move_account(page, nr_pages, 3791 ret = mem_cgroup_move_account(page, nr_pages,
3797 pc, child, parent); 3792 pc, child, parent);
3798 if (!ret) 3793 if (!ret)
3799 __mem_cgroup_cancel_local_charge(child, nr_pages); 3794 __mem_cgroup_cancel_local_charge(child, nr_pages);
3800 3795
3801 if (nr_pages > 1) 3796 if (nr_pages > 1)
3802 compound_unlock_irqrestore(page, flags); 3797 compound_unlock_irqrestore(page, flags);
3803 putback_lru_page(page); 3798 putback_lru_page(page);
3804 put: 3799 put:
3805 put_page(page); 3800 put_page(page);
3806 out: 3801 out:
3807 return ret; 3802 return ret;
3808 } 3803 }
3809 3804
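The temporary-failure semantics spelled out above can be sketched as a bounded retry; example_reparent_page() is purely illustrative (the real reparenting code walks the LRU lists rather than spinning on a single page):

static void example_reparent_page(struct page *page, struct page_cgroup *pc,
				  struct mem_cgroup *child)
{
	int retries = 5;

	/* a failure signals a race with uncharge/migration; back off and retry */
	while (mem_cgroup_move_parent(page, pc, child) && --retries)
		cond_resched();
}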
3810 int mem_cgroup_charge_anon(struct page *page, 3805 int mem_cgroup_charge_anon(struct page *page,
3811 struct mm_struct *mm, gfp_t gfp_mask) 3806 struct mm_struct *mm, gfp_t gfp_mask)
3812 { 3807 {
3813 unsigned int nr_pages = 1; 3808 unsigned int nr_pages = 1;
3814 struct mem_cgroup *memcg; 3809 struct mem_cgroup *memcg;
3815 bool oom = true; 3810 bool oom = true;
3816 3811
3817 if (mem_cgroup_disabled()) 3812 if (mem_cgroup_disabled())
3818 return 0; 3813 return 0;
3819 3814
3820 VM_BUG_ON_PAGE(page_mapped(page), page); 3815 VM_BUG_ON_PAGE(page_mapped(page), page);
3821 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); 3816 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3822 VM_BUG_ON(!mm); 3817 VM_BUG_ON(!mm);
3823 3818
3824 if (PageTransHuge(page)) { 3819 if (PageTransHuge(page)) {
3825 nr_pages <<= compound_order(page); 3820 nr_pages <<= compound_order(page);
3826 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 3821 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3827 /* 3822 /*
3828 * Never OOM-kill a process for a huge page. The 3823 * Never OOM-kill a process for a huge page. The
3829 * fault handler will fall back to regular pages. 3824 * fault handler will fall back to regular pages.
3830 */ 3825 */
3831 oom = false; 3826 oom = false;
3832 } 3827 }
3833 3828
3834 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); 3829 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
3835 if (!memcg) 3830 if (!memcg)
3836 return -ENOMEM; 3831 return -ENOMEM;
3837 __mem_cgroup_commit_charge(memcg, page, nr_pages, 3832 __mem_cgroup_commit_charge(memcg, page, nr_pages,
3838 MEM_CGROUP_CHARGE_TYPE_ANON, false); 3833 MEM_CGROUP_CHARGE_TYPE_ANON, false);
3839 return 0; 3834 return 0;
3840 } 3835 }
3841 3836
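A hedged sketch of how an anonymous-fault path would use this; the helper and its error handling are illustrative, not the actual fault handler:

static int example_charge_new_anon(struct mm_struct *mm, struct page *page)
{
	if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL))
		return -ENOMEM;	/* charge failed; the caller frees the page */
	/* ... install the pte and add the page to the LRU ... */
	return 0;
}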
3842 /* 3837 /*
3843 * While swap-in, try_charge -> commit or cancel, the page is locked. 3838 * While swap-in, try_charge -> commit or cancel, the page is locked.
3844 * And when try_charge() successfully returns, one refcnt to memcg without 3839 * And when try_charge() successfully returns, one refcnt to memcg without
3845 * struct page_cgroup is acquired. This refcnt will be consumed by 3840 * struct page_cgroup is acquired. This refcnt will be consumed by
3846 * "commit()" or removed by "cancel()" 3841 * "commit()" or removed by "cancel()"
3847 */ 3842 */
3848 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 3843 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
3849 struct page *page, 3844 struct page *page,
3850 gfp_t mask, 3845 gfp_t mask,
3851 struct mem_cgroup **memcgp) 3846 struct mem_cgroup **memcgp)
3852 { 3847 {
3853 struct mem_cgroup *memcg = NULL; 3848 struct mem_cgroup *memcg = NULL;
3854 struct page_cgroup *pc; 3849 struct page_cgroup *pc;
3855 int ret; 3850 int ret;
3856 3851
3857 pc = lookup_page_cgroup(page); 3852 pc = lookup_page_cgroup(page);
3858 /* 3853 /*
3859 * Every swap fault against a single page tries to charge the 3854 * Every swap fault against a single page tries to charge the
3860 * page, bail as early as possible. shmem_unuse() encounters 3855 * page, bail as early as possible. shmem_unuse() encounters
3861 * already charged pages, too. The USED bit is protected by 3856 * already charged pages, too. The USED bit is protected by
3862 * the page lock, which serializes swap cache removal, which 3857 * the page lock, which serializes swap cache removal, which
3863 * in turn serializes uncharging. 3858 * in turn serializes uncharging.
3864 */ 3859 */
3865 if (PageCgroupUsed(pc)) 3860 if (PageCgroupUsed(pc))
3866 goto out; 3861 goto out;
3867 if (do_swap_account) 3862 if (do_swap_account)
3868 memcg = try_get_mem_cgroup_from_page(page); 3863 memcg = try_get_mem_cgroup_from_page(page);
3869 if (!memcg) 3864 if (!memcg)
3870 memcg = get_mem_cgroup_from_mm(mm); 3865 memcg = get_mem_cgroup_from_mm(mm);
3871 ret = mem_cgroup_try_charge(memcg, mask, 1, true); 3866 ret = mem_cgroup_try_charge(memcg, mask, 1, true);
3872 css_put(&memcg->css); 3867 css_put(&memcg->css);
3873 if (ret == -EINTR) 3868 if (ret == -EINTR)
3874 memcg = root_mem_cgroup; 3869 memcg = root_mem_cgroup;
3875 else if (ret) 3870 else if (ret)
3876 return ret; 3871 return ret;
3877 out: 3872 out:
3878 *memcgp = memcg; 3873 *memcgp = memcg;
3879 return 0; 3874 return 0;
3880 } 3875 }
3881 3876
3882 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 3877 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
3883 gfp_t gfp_mask, struct mem_cgroup **memcgp) 3878 gfp_t gfp_mask, struct mem_cgroup **memcgp)
3884 { 3879 {
3885 if (mem_cgroup_disabled()) { 3880 if (mem_cgroup_disabled()) {
3886 *memcgp = NULL; 3881 *memcgp = NULL;
3887 return 0; 3882 return 0;
3888 } 3883 }
3889 /* 3884 /*
3890 * A racing thread's fault, or swapoff, may have already 3885 * A racing thread's fault, or swapoff, may have already
3891 * updated the pte, and even removed page from swap cache: in 3886 * updated the pte, and even removed page from swap cache: in
3892 * those cases unuse_pte()'s pte_same() test will fail; but 3887 * those cases unuse_pte()'s pte_same() test will fail; but
3893 * there's also a KSM case which does need to charge the page. 3888 * there's also a KSM case which does need to charge the page.
3894 */ 3889 */
3895 if (!PageSwapCache(page)) { 3890 if (!PageSwapCache(page)) {
3896 struct mem_cgroup *memcg; 3891 struct mem_cgroup *memcg;
3897 3892
3898 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3893 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3899 if (!memcg) 3894 if (!memcg)
3900 return -ENOMEM; 3895 return -ENOMEM;
3901 *memcgp = memcg; 3896 *memcgp = memcg;
3902 return 0; 3897 return 0;
3903 } 3898 }
3904 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 3899 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
3905 } 3900 }
3906 3901
3907 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 3902 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3908 { 3903 {
3909 if (mem_cgroup_disabled()) 3904 if (mem_cgroup_disabled())
3910 return; 3905 return;
3911 if (!memcg) 3906 if (!memcg)
3912 return; 3907 return;
3913 __mem_cgroup_cancel_charge(memcg, 1); 3908 __mem_cgroup_cancel_charge(memcg, 1);
3914 } 3909 }
3915 3910
3916 static void 3911 static void
3917 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 3912 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
3918 enum charge_type ctype) 3913 enum charge_type ctype)
3919 { 3914 {
3920 if (mem_cgroup_disabled()) 3915 if (mem_cgroup_disabled())
3921 return; 3916 return;
3922 if (!memcg) 3917 if (!memcg)
3923 return; 3918 return;
3924 3919
3925 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 3920 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
3926 /* 3921 /*
3927 * Now swap is on-memory. This means this page may be 3922 * Now swap is on-memory. This means this page may be
3928 * counted both as mem and swap, i.e. a double count. 3923 * counted both as mem and swap, i.e. a double count.
3929 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 3924 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
3930 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 3925 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
3931 * may call delete_from_swap_cache() before we reach here. 3926 * may call delete_from_swap_cache() before we reach here.
3932 */ 3927 */
3933 if (do_swap_account && PageSwapCache(page)) { 3928 if (do_swap_account && PageSwapCache(page)) {
3934 swp_entry_t ent = {.val = page_private(page)}; 3929 swp_entry_t ent = {.val = page_private(page)};
3935 mem_cgroup_uncharge_swap(ent); 3930 mem_cgroup_uncharge_swap(ent);
3936 } 3931 }
3937 } 3932 }
3938 3933
3939 void mem_cgroup_commit_charge_swapin(struct page *page, 3934 void mem_cgroup_commit_charge_swapin(struct page *page,
3940 struct mem_cgroup *memcg) 3935 struct mem_cgroup *memcg)
3941 { 3936 {
3942 __mem_cgroup_commit_charge_swapin(page, memcg, 3937 __mem_cgroup_commit_charge_swapin(page, memcg,
3943 MEM_CGROUP_CHARGE_TYPE_ANON); 3938 MEM_CGROUP_CHARGE_TYPE_ANON);
3944 } 3939 }
3945 3940
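A hedged sketch of the try_charge -> commit-or-cancel protocol described above __mem_cgroup_try_charge_swapin(); example_swapin() and its map_ok parameter are illustrative only:

static int example_swapin(struct mm_struct *mm, struct page *page, bool map_ok)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &memcg))
		return -ENOMEM;

	if (!map_ok) {
		/* installing the pte failed: drop the pre-charge */
		mem_cgroup_cancel_charge_swapin(memcg);
		return -EFAULT;
	}

	/* the pte is in place: commit, and uncharge the memsw double count */
	mem_cgroup_commit_charge_swapin(page, memcg);
	return 0;
}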
3946 int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, 3941 int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,
3947 gfp_t gfp_mask) 3942 gfp_t gfp_mask)
3948 { 3943 {
3949 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3944 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3950 struct mem_cgroup *memcg; 3945 struct mem_cgroup *memcg;
3951 int ret; 3946 int ret;
3952 3947
3953 if (mem_cgroup_disabled()) 3948 if (mem_cgroup_disabled())
3954 return 0; 3949 return 0;
3955 if (PageCompound(page)) 3950 if (PageCompound(page))
3956 return 0; 3951 return 0;
3957 3952
3958 if (PageSwapCache(page)) { /* shmem */ 3953 if (PageSwapCache(page)) { /* shmem */
3959 ret = __mem_cgroup_try_charge_swapin(mm, page, 3954 ret = __mem_cgroup_try_charge_swapin(mm, page,
3960 gfp_mask, &memcg); 3955 gfp_mask, &memcg);
3961 if (ret) 3956 if (ret)
3962 return ret; 3957 return ret;
3963 __mem_cgroup_commit_charge_swapin(page, memcg, type); 3958 __mem_cgroup_commit_charge_swapin(page, memcg, type);
3964 return 0; 3959 return 0;
3965 } 3960 }
3966 3961
3967 /* 3962 /*
3968 * Page cache insertions can happen without an actual mm 3963 * Page cache insertions can happen without an actual mm
3969 * context, e.g. during disk probing on boot. 3964 * context, e.g. during disk probing on boot.
3970 */ 3965 */
3971 if (unlikely(!mm)) 3966 if (unlikely(!mm))
3972 memcg = root_mem_cgroup; 3967 memcg = root_mem_cgroup;
3973 else { 3968 else {
3974 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); 3969 memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true);
3975 if (!memcg) 3970 if (!memcg)
3976 return -ENOMEM; 3971 return -ENOMEM;
3977 } 3972 }
3978 __mem_cgroup_commit_charge(memcg, page, 1, type, false); 3973 __mem_cgroup_commit_charge(memcg, page, 1, type, false);
3979 return 0; 3974 return 0;
3980 } 3975 }
3981 3976
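A hedged sketch of the page-cache side; example_add_to_cache() is illustrative and stands in for the real page-cache insertion path:

static int example_add_to_cache(struct page *page, struct mm_struct *mm,
				gfp_t gfp)
{
	int ret;

	ret = mem_cgroup_charge_file(page, mm, gfp);
	if (ret)
		return ret;	/* over limit or interrupted: do not insert */
	/* ... insert the page into the mapping and onto the LRU ... */
	return 0;
}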
3982 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 3977 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3983 unsigned int nr_pages, 3978 unsigned int nr_pages,
3984 const enum charge_type ctype) 3979 const enum charge_type ctype)
3985 { 3980 {
3986 struct memcg_batch_info *batch = NULL; 3981 struct memcg_batch_info *batch = NULL;
3987 bool uncharge_memsw = true; 3982 bool uncharge_memsw = true;
3988 3983
3989 /* If swapout, usage of swap doesn't decrease */ 3984 /* If swapout, usage of swap doesn't decrease */
3990 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 3985 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3991 uncharge_memsw = false; 3986 uncharge_memsw = false;
3992 3987
3993 batch = &current->memcg_batch; 3988 batch = &current->memcg_batch;
3994 /* 3989 /*
3995 * Usually, we do css_get() when we remember the memcg pointer. 3990 * Usually, we do css_get() when we remember the memcg pointer.
3996 * But in this case, we keep res->usage until the end of a series of 3991 * But in this case, we keep res->usage until the end of a series of
3997 * uncharges. Then, it's ok to ignore memcg's refcnt. 3992 * uncharges. Then, it's ok to ignore memcg's refcnt.
3998 */ 3993 */
3999 if (!batch->memcg) 3994 if (!batch->memcg)
4000 batch->memcg = memcg; 3995 batch->memcg = memcg;
4001 /* 3996 /*
4002 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 3997 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
4003 * In those cases, all pages freed continuously can be expected to be in 3998 * In those cases, all pages freed continuously can be expected to be in
4004 * the same cgroup and we have chance to coalesce uncharges. 3999 * the same cgroup and we have chance to coalesce uncharges.
4005 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 4000 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
4006 * because we want to do uncharge as soon as possible. 4001 * because we want to do uncharge as soon as possible.
4007 */ 4002 */
4008 4003
4009 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 4004 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
4010 goto direct_uncharge; 4005 goto direct_uncharge;
4011 4006
4012 if (nr_pages > 1) 4007 if (nr_pages > 1)
4013 goto direct_uncharge; 4008 goto direct_uncharge;
4014 4009
4015 /* 4010 /*
4016 * In the typical case, batch->memcg == mem. This means we can 4011 * In the typical case, batch->memcg == mem. This means we can
4017 * merge a series of uncharges to an uncharge of res_counter. 4012 * merge a series of uncharges to an uncharge of res_counter.
4018 * If not, we uncharge res_counter one by one. 4013 * If not, we uncharge res_counter one by one.
4019 */ 4014 */
4020 if (batch->memcg != memcg) 4015 if (batch->memcg != memcg)
4021 goto direct_uncharge; 4016 goto direct_uncharge;
4022 /* remember freed charge and uncharge it later */ 4017 /* remember freed charge and uncharge it later */
4023 batch->nr_pages++; 4018 batch->nr_pages++;
4024 if (uncharge_memsw) 4019 if (uncharge_memsw)
4025 batch->memsw_nr_pages++; 4020 batch->memsw_nr_pages++;
4026 return; 4021 return;
4027 direct_uncharge: 4022 direct_uncharge:
4028 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4023 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
4029 if (uncharge_memsw) 4024 if (uncharge_memsw)
4030 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4025 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
4031 if (unlikely(batch->memcg != memcg)) 4026 if (unlikely(batch->memcg != memcg))
4032 memcg_oom_recover(memcg); 4027 memcg_oom_recover(memcg);
4033 } 4028 }
4034 4029
4035 /* 4030 /*
4036 * uncharge if !page_mapped(page) 4031 * uncharge if !page_mapped(page)
4037 */ 4032 */
4038 static struct mem_cgroup * 4033 static struct mem_cgroup *
4039 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4034 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4040 bool end_migration) 4035 bool end_migration)
4041 { 4036 {
4042 struct mem_cgroup *memcg = NULL; 4037 struct mem_cgroup *memcg = NULL;
4043 unsigned int nr_pages = 1; 4038 unsigned int nr_pages = 1;
4044 struct page_cgroup *pc; 4039 struct page_cgroup *pc;
4045 bool anon; 4040 bool anon;
4046 4041
4047 if (mem_cgroup_disabled()) 4042 if (mem_cgroup_disabled())
4048 return NULL; 4043 return NULL;
4049 4044
4050 if (PageTransHuge(page)) { 4045 if (PageTransHuge(page)) {
4051 nr_pages <<= compound_order(page); 4046 nr_pages <<= compound_order(page);
4052 VM_BUG_ON_PAGE(!PageTransHuge(page), page); 4047 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4053 } 4048 }
4054 /* 4049 /*
4055 * Check if our page_cgroup is valid 4050 * Check if our page_cgroup is valid
4056 */ 4051 */
4057 pc = lookup_page_cgroup(page); 4052 pc = lookup_page_cgroup(page);
4058 if (unlikely(!PageCgroupUsed(pc))) 4053 if (unlikely(!PageCgroupUsed(pc)))
4059 return NULL; 4054 return NULL;
4060 4055
4061 lock_page_cgroup(pc); 4056 lock_page_cgroup(pc);
4062 4057
4063 memcg = pc->mem_cgroup; 4058 memcg = pc->mem_cgroup;
4064 4059
4065 if (!PageCgroupUsed(pc)) 4060 if (!PageCgroupUsed(pc))
4066 goto unlock_out; 4061 goto unlock_out;
4067 4062
4068 anon = PageAnon(page); 4063 anon = PageAnon(page);
4069 4064
4070 switch (ctype) { 4065 switch (ctype) {
4071 case MEM_CGROUP_CHARGE_TYPE_ANON: 4066 case MEM_CGROUP_CHARGE_TYPE_ANON:
4072 /* 4067 /*
4073 * Generally PageAnon tells if it's the anon statistics to be 4068 * Generally PageAnon tells if it's the anon statistics to be
4074 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4069 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
4075 * used before page reached the stage of being marked PageAnon. 4070 * used before page reached the stage of being marked PageAnon.
4076 */ 4071 */
4077 anon = true; 4072 anon = true;
4078 /* fallthrough */ 4073 /* fallthrough */
4079 case MEM_CGROUP_CHARGE_TYPE_DROP: 4074 case MEM_CGROUP_CHARGE_TYPE_DROP:
4080 /* See mem_cgroup_prepare_migration() */ 4075 /* See mem_cgroup_prepare_migration() */
4081 if (page_mapped(page)) 4076 if (page_mapped(page))
4082 goto unlock_out; 4077 goto unlock_out;
4083 /* 4078 /*
4084 * Pages under migration may not be uncharged. But 4079 * Pages under migration may not be uncharged. But
4085 * end_migration() /must/ be the one uncharging the 4080 * end_migration() /must/ be the one uncharging the
4086 * unused post-migration page and so it has to call 4081 * unused post-migration page and so it has to call
4087 * here with the migration bit still set. See the 4082 * here with the migration bit still set. See the
4088 * res_counter handling below. 4083 * res_counter handling below.
4089 */ 4084 */
4090 if (!end_migration && PageCgroupMigration(pc)) 4085 if (!end_migration && PageCgroupMigration(pc))
4091 goto unlock_out; 4086 goto unlock_out;
4092 break; 4087 break;
4093 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4088 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
4094 if (!PageAnon(page)) { /* Shared memory */ 4089 if (!PageAnon(page)) { /* Shared memory */
4095 if (page->mapping && !page_is_file_cache(page)) 4090 if (page->mapping && !page_is_file_cache(page))
4096 goto unlock_out; 4091 goto unlock_out;
4097 } else if (page_mapped(page)) /* Anon */ 4092 } else if (page_mapped(page)) /* Anon */
4098 goto unlock_out; 4093 goto unlock_out;
4099 break; 4094 break;
4100 default: 4095 default:
4101 break; 4096 break;
4102 } 4097 }
4103 4098
4104 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4099 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
4105 4100
4106 ClearPageCgroupUsed(pc); 4101 ClearPageCgroupUsed(pc);
4107 /* 4102 /*
4108 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4103 * pc->mem_cgroup is not cleared here. It will be accessed when it's
4109 * freed from LRU. This is safe because uncharged page is expected not 4104 * freed from LRU. This is safe because uncharged page is expected not
4110 * to be reused (freed soon). Exception is SwapCache, it's handled by 4105 * to be reused (freed soon). Exception is SwapCache, it's handled by
4111 * special functions. 4106 * special functions.
4112 */ 4107 */
4113 4108
4114 unlock_page_cgroup(pc); 4109 unlock_page_cgroup(pc);
4115 /* 4110 /*
4116 * even after unlock, we have memcg->res.usage here and this memcg 4111 * even after unlock, we have memcg->res.usage here and this memcg
4117 * will never be freed, so it's safe to call css_get(). 4112 * will never be freed, so it's safe to call css_get().
4118 */ 4113 */
4119 memcg_check_events(memcg, page); 4114 memcg_check_events(memcg, page);
4120 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4115 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
4121 mem_cgroup_swap_statistics(memcg, true); 4116 mem_cgroup_swap_statistics(memcg, true);
4122 css_get(&memcg->css); 4117 css_get(&memcg->css);
4123 } 4118 }
4124 /* 4119 /*
4125 * Migration does not charge the res_counter for the 4120 * Migration does not charge the res_counter for the
4126 * replacement page, so leave it alone when phasing out the 4121 * replacement page, so leave it alone when phasing out the
4127 * page that is unused after the migration. 4122 * page that is unused after the migration.
4128 */ 4123 */
4129 if (!end_migration && !mem_cgroup_is_root(memcg)) 4124 if (!end_migration && !mem_cgroup_is_root(memcg))
4130 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 4125 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
4131 4126
4132 return memcg; 4127 return memcg;
4133 4128
4134 unlock_out: 4129 unlock_out:
4135 unlock_page_cgroup(pc); 4130 unlock_page_cgroup(pc);
4136 return NULL; 4131 return NULL;
4137 } 4132 }
4138 4133
4139 void mem_cgroup_uncharge_page(struct page *page) 4134 void mem_cgroup_uncharge_page(struct page *page)
4140 { 4135 {
4141 /* early check. */ 4136 /* early check. */
4142 if (page_mapped(page)) 4137 if (page_mapped(page))
4143 return; 4138 return;
4144 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); 4139 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4145 /* 4140 /*
4146 * If the page is in swap cache, uncharge should be deferred 4141 * If the page is in swap cache, uncharge should be deferred
4147 * to the swap path, which also properly accounts swap usage 4142 * to the swap path, which also properly accounts swap usage
4148 * and handles memcg lifetime. 4143 * and handles memcg lifetime.
4149 * 4144 *
4150 * Note that this check is not stable and reclaim may add the 4145 * Note that this check is not stable and reclaim may add the
4151 * page to swap cache at any time after this. However, if the 4146 * page to swap cache at any time after this. However, if the
4152 * page is not in swap cache by the time page->mapcount hits 4147 * page is not in swap cache by the time page->mapcount hits
4153 * 0, there won't be any page table references to the swap 4148 * 0, there won't be any page table references to the swap
4154 * slot, and reclaim will free it and not actually write the 4149 * slot, and reclaim will free it and not actually write the
4155 * page to disk. 4150 * page to disk.
4156 */ 4151 */
4157 if (PageSwapCache(page)) 4152 if (PageSwapCache(page))
4158 return; 4153 return;
4159 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); 4154 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
4160 } 4155 }
4161 4156
4162 void mem_cgroup_uncharge_cache_page(struct page *page) 4157 void mem_cgroup_uncharge_cache_page(struct page *page)
4163 { 4158 {
4164 VM_BUG_ON_PAGE(page_mapped(page), page); 4159 VM_BUG_ON_PAGE(page_mapped(page), page);
4165 VM_BUG_ON_PAGE(page->mapping, page); 4160 VM_BUG_ON_PAGE(page->mapping, page);
4166 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4161 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4167 } 4162 }
4168 4163
4169 /* 4164 /*
4170 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. 4165 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
4171 * In those cases, pages are freed continuously and we can expect the pages 4166 * In those cases, pages are freed continuously and we can expect the pages
4172 * to be in the same memcg. Each of these calls itself limits the number of 4167 * to be in the same memcg. Each of these calls itself limits the number of
4173 * pages freed at once, so uncharge_start/end() is called properly. 4168 * pages freed at once, so uncharge_start/end() is called properly.
4174 * This may be called multiple (2) times in one context. 4169 * This may be called multiple (2) times in one context.
4175 */ 4170 */
4176 4171
4177 void mem_cgroup_uncharge_start(void) 4172 void mem_cgroup_uncharge_start(void)
4178 { 4173 {
4179 current->memcg_batch.do_batch++; 4174 current->memcg_batch.do_batch++;
4180 /* We can nest. */ 4175 /* We can nest. */
4181 if (current->memcg_batch.do_batch == 1) { 4176 if (current->memcg_batch.do_batch == 1) {
4182 current->memcg_batch.memcg = NULL; 4177 current->memcg_batch.memcg = NULL;
4183 current->memcg_batch.nr_pages = 0; 4178 current->memcg_batch.nr_pages = 0;
4184 current->memcg_batch.memsw_nr_pages = 0; 4179 current->memcg_batch.memsw_nr_pages = 0;
4185 } 4180 }
4186 } 4181 }
4187 4182
4188 void mem_cgroup_uncharge_end(void) 4183 void mem_cgroup_uncharge_end(void)
4189 { 4184 {
4190 struct memcg_batch_info *batch = &current->memcg_batch; 4185 struct memcg_batch_info *batch = &current->memcg_batch;
4191 4186
4192 if (!batch->do_batch) 4187 if (!batch->do_batch)
4193 return; 4188 return;
4194 4189
4195 batch->do_batch--; 4190 batch->do_batch--;
4196 if (batch->do_batch) /* If stacked, do nothing. */ 4191 if (batch->do_batch) /* If stacked, do nothing. */
4197 return; 4192 return;
4198 4193
4199 if (!batch->memcg) 4194 if (!batch->memcg)
4200 return; 4195 return;
4201 /* 4196 /*
4202 * This "batch->memcg" is valid without any css_get/put etc... 4197 * This "batch->memcg" is valid without any css_get/put etc...
4203 * because we hide charges behind us. 4198 * because we hide charges behind us.
4204 */ 4199 */
4205 if (batch->nr_pages) 4200 if (batch->nr_pages)
4206 res_counter_uncharge(&batch->memcg->res, 4201 res_counter_uncharge(&batch->memcg->res,
4207 batch->nr_pages * PAGE_SIZE); 4202 batch->nr_pages * PAGE_SIZE);
4208 if (batch->memsw_nr_pages) 4203 if (batch->memsw_nr_pages)
4209 res_counter_uncharge(&batch->memcg->memsw, 4204 res_counter_uncharge(&batch->memcg->memsw,
4210 batch->memsw_nr_pages * PAGE_SIZE); 4205 batch->memsw_nr_pages * PAGE_SIZE);
4211 memcg_oom_recover(batch->memcg); 4206 memcg_oom_recover(batch->memcg);
4212 /* forget this pointer (for sanity check) */ 4207 /* forget this pointer (for sanity check) */
4213 batch->memcg = NULL; 4208 batch->memcg = NULL;
4214 } 4209 }
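
The uncharge batching above boils down to a nesting counter plus deferred res_counter updates: only the outermost mem_cgroup_uncharge_end() performs the flush. Below is a minimal userspace sketch of that pattern, assuming made-up helper names (batch_begin/batch_end/batched_uncharge); it is a model of the idea, not the kernel API.

/* Minimal model of nestable uncharge batching. */
#include <stdio.h>

static struct {
	int do_batch;            /* nesting depth, like memcg_batch.do_batch */
	unsigned long nr_pages;  /* pages accumulated while batching */
} batch;

static void batch_begin(void)
{
	if (++batch.do_batch == 1)       /* outermost begin resets the state */
		batch.nr_pages = 0;
}

static void batched_uncharge(unsigned long nr_pages)
{
	if (batch.do_batch)
		batch.nr_pages += nr_pages;              /* defer */
	else
		printf("uncharge %lu page(s) immediately\n", nr_pages);
}

static void batch_end(void)
{
	if (--batch.do_batch)            /* still nested: do nothing */
		return;
	if (batch.nr_pages)              /* outermost end flushes once */
		printf("uncharge %lu page(s) in one res_counter update\n",
		       batch.nr_pages);
	batch.nr_pages = 0;
}

int main(void)
{
	batch_begin();
	batch_begin();                   /* nesting is allowed */
	batched_uncharge(3);
	batched_uncharge(5);
	batch_end();                     /* inner end: nothing happens */
	batch_end();                     /* outer end: flushes 8 pages */
	return 0;
}

This is why callers such as truncation can uncharge many same-memcg pages with a single res_counter update instead of one per page.
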
4215 4210
4216 #ifdef CONFIG_SWAP 4211 #ifdef CONFIG_SWAP
4217 /* 4212 /*
4218 * Called after __delete_from_swap_cache(); drops the "page" account. 4213 * Called after __delete_from_swap_cache(); drops the "page" account.
4219 * The memcg information is recorded in the swap_cgroup of "ent". 4214 * The memcg information is recorded in the swap_cgroup of "ent".
4220 */ 4215 */
4221 void 4216 void
4222 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 4217 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
4223 { 4218 {
4224 struct mem_cgroup *memcg; 4219 struct mem_cgroup *memcg;
4225 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 4220 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
4226 4221
4227 if (!swapout) /* this was a swap cache but the swap is unused! */ 4222 if (!swapout) /* this was a swap cache but the swap is unused! */
4228 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 4223 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
4229 4224
4230 memcg = __mem_cgroup_uncharge_common(page, ctype, false); 4225 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
4231 4226
4232 /* 4227 /*
4233 * record memcg information, if swapout && memcg != NULL, 4228 * record memcg information, if swapout && memcg != NULL,
4234 * css_get() was called in uncharge(). 4229 * css_get() was called in uncharge().
4235 */ 4230 */
4236 if (do_swap_account && swapout && memcg) 4231 if (do_swap_account && swapout && memcg)
4237 swap_cgroup_record(ent, mem_cgroup_id(memcg)); 4232 swap_cgroup_record(ent, mem_cgroup_id(memcg));
4238 } 4233 }
4239 #endif 4234 #endif
4240 4235
4241 #ifdef CONFIG_MEMCG_SWAP 4236 #ifdef CONFIG_MEMCG_SWAP
4242 /* 4237 /*
4243 * Called from swap_entry_free(). Removes the record in swap_cgroup and 4238 * Called from swap_entry_free(). Removes the record in swap_cgroup and
4244 * uncharges the "memsw" account. 4239 * uncharges the "memsw" account.
4245 */ 4240 */
4246 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4241 void mem_cgroup_uncharge_swap(swp_entry_t ent)
4247 { 4242 {
4248 struct mem_cgroup *memcg; 4243 struct mem_cgroup *memcg;
4249 unsigned short id; 4244 unsigned short id;
4250 4245
4251 if (!do_swap_account) 4246 if (!do_swap_account)
4252 return; 4247 return;
4253 4248
4254 id = swap_cgroup_record(ent, 0); 4249 id = swap_cgroup_record(ent, 0);
4255 rcu_read_lock(); 4250 rcu_read_lock();
4256 memcg = mem_cgroup_lookup(id); 4251 memcg = mem_cgroup_lookup(id);
4257 if (memcg) { 4252 if (memcg) {
4258 /* 4253 /*
4259 * We uncharge this because the swap is freed. 4254 * We uncharge this because the swap is freed.
4260 * This memcg can be an obsolete one. We avoid calling css_tryget(). 4255 * This memcg can be an obsolete one. We avoid calling css_tryget().
4261 */ 4256 */
4262 if (!mem_cgroup_is_root(memcg)) 4257 if (!mem_cgroup_is_root(memcg))
4263 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4258 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
4264 mem_cgroup_swap_statistics(memcg, false); 4259 mem_cgroup_swap_statistics(memcg, false);
4265 css_put(&memcg->css); 4260 css_put(&memcg->css);
4266 } 4261 }
4267 rcu_read_unlock(); 4262 rcu_read_unlock();
4268 } 4263 }
4269 4264
4270 /** 4265 /**
4271 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4266 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
4272 * @entry: swap entry to be moved 4267 * @entry: swap entry to be moved
4273 * @from: mem_cgroup which the entry is moved from 4268 * @from: mem_cgroup which the entry is moved from
4274 * @to: mem_cgroup which the entry is moved to 4269 * @to: mem_cgroup which the entry is moved to
4275 * 4270 *
4276 * It succeeds only when the swap_cgroup's record for this entry is the same 4271 * It succeeds only when the swap_cgroup's record for this entry is the same
4277 * as the mem_cgroup's id of @from. 4272 * as the mem_cgroup's id of @from.
4278 * 4273 *
4279 * Returns 0 on success, -EINVAL on failure. 4274 * Returns 0 on success, -EINVAL on failure.
4280 * 4275 *
4281 * The caller must have charged to @to, IOW, called res_counter_charge() about 4276 * The caller must have charged to @to, IOW, called res_counter_charge() about
4282 * both res and memsw, and called css_get(). 4277 * both res and memsw, and called css_get().
4283 */ 4278 */
4284 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4279 static int mem_cgroup_move_swap_account(swp_entry_t entry,
4285 struct mem_cgroup *from, struct mem_cgroup *to) 4280 struct mem_cgroup *from, struct mem_cgroup *to)
4286 { 4281 {
4287 unsigned short old_id, new_id; 4282 unsigned short old_id, new_id;
4288 4283
4289 old_id = mem_cgroup_id(from); 4284 old_id = mem_cgroup_id(from);
4290 new_id = mem_cgroup_id(to); 4285 new_id = mem_cgroup_id(to);
4291 4286
4292 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4287 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
4293 mem_cgroup_swap_statistics(from, false); 4288 mem_cgroup_swap_statistics(from, false);
4294 mem_cgroup_swap_statistics(to, true); 4289 mem_cgroup_swap_statistics(to, true);
4295 /* 4290 /*
4296 * This function is only called from task migration context now. 4291 * This function is only called from task migration context now.
4297 * It postpones res_counter and refcount handling till the end 4292 * It postpones res_counter and refcount handling till the end
4298 * of task migration(mem_cgroup_clear_mc()) for performance 4293 * of task migration(mem_cgroup_clear_mc()) for performance
4299 * improvement. But we cannot postpone css_get(to) because if 4294 * improvement. But we cannot postpone css_get(to) because if
4300 * the process that has been moved to @to does swap-in, the 4295 * the process that has been moved to @to does swap-in, the
4301 * refcount of @to might be decreased to 0. 4296 * refcount of @to might be decreased to 0.
4302 * 4297 *
4303 * We are in attach() phase, so the cgroup is guaranteed to be 4298 * We are in attach() phase, so the cgroup is guaranteed to be
4304 * alive, so we can just call css_get(). 4299 * alive, so we can just call css_get().
4305 */ 4300 */
4306 css_get(&to->css); 4301 css_get(&to->css);
4307 return 0; 4302 return 0;
4308 } 4303 }
4309 return -EINVAL; 4304 return -EINVAL;
4310 } 4305 }
4311 #else 4306 #else
4312 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4307 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
4313 struct mem_cgroup *from, struct mem_cgroup *to) 4308 struct mem_cgroup *from, struct mem_cgroup *to)
4314 { 4309 {
4315 return -EINVAL; 4310 return -EINVAL;
4316 } 4311 }
4317 #endif 4312 #endif
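
The move above hinges on swap_cgroup_cmpxchg(): the record is handed from @from to @to only if it still names @from; otherwise -EINVAL is returned and nothing changes. A simplified, self-contained model of that compare-and-exchange follows, with a plain array standing in for the swap_cgroup map; the table and helper names are illustrative, not the kernel's layout.

#include <stdio.h>

#define NR_SWAP_ENTRIES 16
static unsigned short swap_owner[NR_SWAP_ENTRIES];   /* memcg id per swap entry */

/* Returns the old id; the record changes only if it matched old_id. */
static unsigned short record_cmpxchg(int entry, unsigned short old_id,
				     unsigned short new_id)
{
	unsigned short cur = swap_owner[entry];

	if (cur == old_id)
		swap_owner[entry] = new_id;
	return cur;
}

static int move_swap_record(int entry, unsigned short from, unsigned short to)
{
	if (record_cmpxchg(entry, from, to) == from)
		return 0;    /* moved; swap statistics would be adjusted here */
	return -1;           /* record belonged to someone else: -EINVAL above */
}

int main(void)
{
	swap_owner[3] = 7;
	printf("move owned entry:   %d\n", move_swap_record(3, 7, 9));  /* 0 */
	printf("move foreign entry: %d\n", move_swap_record(3, 7, 9));  /* -1 */
	return 0;
}
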
4318 4313
4319 /* 4314 /*
4320 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4315 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
4321 * page belongs to. 4316 * page belongs to.
4322 */ 4317 */
4323 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4318 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
4324 struct mem_cgroup **memcgp) 4319 struct mem_cgroup **memcgp)
4325 { 4320 {
4326 struct mem_cgroup *memcg = NULL; 4321 struct mem_cgroup *memcg = NULL;
4327 unsigned int nr_pages = 1; 4322 unsigned int nr_pages = 1;
4328 struct page_cgroup *pc; 4323 struct page_cgroup *pc;
4329 enum charge_type ctype; 4324 enum charge_type ctype;
4330 4325
4331 *memcgp = NULL; 4326 *memcgp = NULL;
4332 4327
4333 if (mem_cgroup_disabled()) 4328 if (mem_cgroup_disabled())
4334 return; 4329 return;
4335 4330
4336 if (PageTransHuge(page)) 4331 if (PageTransHuge(page))
4337 nr_pages <<= compound_order(page); 4332 nr_pages <<= compound_order(page);
4338 4333
4339 pc = lookup_page_cgroup(page); 4334 pc = lookup_page_cgroup(page);
4340 lock_page_cgroup(pc); 4335 lock_page_cgroup(pc);
4341 if (PageCgroupUsed(pc)) { 4336 if (PageCgroupUsed(pc)) {
4342 memcg = pc->mem_cgroup; 4337 memcg = pc->mem_cgroup;
4343 css_get(&memcg->css); 4338 css_get(&memcg->css);
4344 /* 4339 /*
4345 * When migrating an anonymous page, its mapcount goes down 4340 * When migrating an anonymous page, its mapcount goes down
4346 * to 0 and uncharge() will be called. But, even if it's fully 4341 * to 0 and uncharge() will be called. But, even if it's fully
4347 * unmapped, migration may fail and this page has to be 4342 * unmapped, migration may fail and this page has to be
4348 * charged again. We set MIGRATION flag here and delay uncharge 4343 * charged again. We set MIGRATION flag here and delay uncharge
4349 * until end_migration() is called 4344 * until end_migration() is called
4350 * 4345 *
4351 * Corner Case Thinking 4346 * Corner Case Thinking
4352 * A) 4347 * A)
4353 * When the old page was mapped as Anon and was unmapped and freed 4348 * When the old page was mapped as Anon and was unmapped and freed
4354 * while migration was ongoing. 4349 * while migration was ongoing.
4355 * If unmap finds the old page, uncharge() of it will be delayed 4350 * If unmap finds the old page, uncharge() of it will be delayed
4356 * until end_migration(). If unmap finds a new page, it's 4351 * until end_migration(). If unmap finds a new page, it's
4357 * uncharged when its mapcount goes from 1 to 0. If the unmap code 4352 * uncharged when its mapcount goes from 1 to 0. If the unmap code
4358 * finds swap_migration_entry, the new page will not be mapped 4353 * finds swap_migration_entry, the new page will not be mapped
4359 * and end_migration() will find it (mapcount == 0). 4354 * and end_migration() will find it (mapcount == 0).
4360 * 4355 *
4361 * B) 4356 * B)
4362 * When the old page was mapped but migration fails, the kernel 4357 * When the old page was mapped but migration fails, the kernel
4363 * remaps it. A charge for it is kept by MIGRATION flag even 4358 * remaps it. A charge for it is kept by MIGRATION flag even
4364 * if mapcount goes down to 0. We can do remap successfully 4359 * if mapcount goes down to 0. We can do remap successfully
4365 * without charging it again. 4360 * without charging it again.
4366 * 4361 *
4367 * C) 4362 * C)
4368 * The "old" page is under lock_page() until the end of 4363 * The "old" page is under lock_page() until the end of
4369 * migration, so, the old page itself will not be swapped-out. 4364 * migration, so, the old page itself will not be swapped-out.
4370 * If the new page is swapped out before end_migration, our 4365 * If the new page is swapped out before end_migration, our
4371 * hook to usual swap-out path will catch the event. 4366 * hook to usual swap-out path will catch the event.
4372 */ 4367 */
4373 if (PageAnon(page)) 4368 if (PageAnon(page))
4374 SetPageCgroupMigration(pc); 4369 SetPageCgroupMigration(pc);
4375 } 4370 }
4376 unlock_page_cgroup(pc); 4371 unlock_page_cgroup(pc);
4377 /* 4372 /*
4378 * If the page is not charged at this point, 4373 * If the page is not charged at this point,
4379 * we return here. 4374 * we return here.
4380 */ 4375 */
4381 if (!memcg) 4376 if (!memcg)
4382 return; 4377 return;
4383 4378
4384 *memcgp = memcg; 4379 *memcgp = memcg;
4385 /* 4380 /*
4386 * We charge new page before it's used/mapped. So, even if unlock_page() 4381 * We charge new page before it's used/mapped. So, even if unlock_page()
4387 * is called before end_migration, we can catch all events on this new 4382 * is called before end_migration, we can catch all events on this new
4388 * page. In case the new page is migrated but not remapped, the new page's 4383 * page. In case the new page is migrated but not remapped, the new page's
4389 * mapcount will finally be 0 and we call uncharge in end_migration(). 4384 * mapcount will finally be 0 and we call uncharge in end_migration().
4390 */ 4385 */
4391 if (PageAnon(page)) 4386 if (PageAnon(page))
4392 ctype = MEM_CGROUP_CHARGE_TYPE_ANON; 4387 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
4393 else 4388 else
4394 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 4389 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
4395 /* 4390 /*
4396 * The page is committed to the memcg, but it's not actually 4391 * The page is committed to the memcg, but it's not actually
4397 * charged to the res_counter since we plan on replacing the 4392 * charged to the res_counter since we plan on replacing the
4398 * old one and only one page is going to be left afterwards. 4393 * old one and only one page is going to be left afterwards.
4399 */ 4394 */
4400 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); 4395 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
4401 } 4396 }
4402 4397
4403 /* remove redundant charge if migration failed*/ 4398 /* remove redundant charge if migration failed*/
4404 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 4399 void mem_cgroup_end_migration(struct mem_cgroup *memcg,
4405 struct page *oldpage, struct page *newpage, bool migration_ok) 4400 struct page *oldpage, struct page *newpage, bool migration_ok)
4406 { 4401 {
4407 struct page *used, *unused; 4402 struct page *used, *unused;
4408 struct page_cgroup *pc; 4403 struct page_cgroup *pc;
4409 bool anon; 4404 bool anon;
4410 4405
4411 if (!memcg) 4406 if (!memcg)
4412 return; 4407 return;
4413 4408
4414 if (!migration_ok) { 4409 if (!migration_ok) {
4415 used = oldpage; 4410 used = oldpage;
4416 unused = newpage; 4411 unused = newpage;
4417 } else { 4412 } else {
4418 used = newpage; 4413 used = newpage;
4419 unused = oldpage; 4414 unused = oldpage;
4420 } 4415 }
4421 anon = PageAnon(used); 4416 anon = PageAnon(used);
4422 __mem_cgroup_uncharge_common(unused, 4417 __mem_cgroup_uncharge_common(unused,
4423 anon ? MEM_CGROUP_CHARGE_TYPE_ANON 4418 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
4424 : MEM_CGROUP_CHARGE_TYPE_CACHE, 4419 : MEM_CGROUP_CHARGE_TYPE_CACHE,
4425 true); 4420 true);
4426 css_put(&memcg->css); 4421 css_put(&memcg->css);
4427 /* 4422 /*
4428 * We disallowed uncharge of pages under migration because mapcount 4423 * We disallowed uncharge of pages under migration because mapcount
4429 * of the page goes down to zero, temporarily. 4424 * of the page goes down to zero, temporarily.
4430 * Clear the flag and check whether the page should be charged. 4425 * Clear the flag and check whether the page should be charged.
4431 */ 4426 */
4432 pc = lookup_page_cgroup(oldpage); 4427 pc = lookup_page_cgroup(oldpage);
4433 lock_page_cgroup(pc); 4428 lock_page_cgroup(pc);
4434 ClearPageCgroupMigration(pc); 4429 ClearPageCgroupMigration(pc);
4435 unlock_page_cgroup(pc); 4430 unlock_page_cgroup(pc);
4436 4431
4437 /* 4432 /*
4438 * If a page is a file cache, radix-tree replacement is very atomic 4433 * If a page is a file cache, radix-tree replacement is very atomic
4439 * and we can skip this check. When it was an Anon page, its mapcount 4434 * and we can skip this check. When it was an Anon page, its mapcount
4440 * goes down to 0. But because we added the MIGRATION flag, it's not 4435 * goes down to 0. But because we added the MIGRATION flag, it's not
4441 * uncharged yet. There are several cases but the page->mapcount check 4436 * uncharged yet. There are several cases but the page->mapcount check
4442 * and USED bit check in mem_cgroup_uncharge_page() will do enough 4437 * and USED bit check in mem_cgroup_uncharge_page() will do enough
4443 * check. (see prepare_charge() also) 4438 * check. (see prepare_charge() also)
4444 */ 4439 */
4445 if (anon) 4440 if (anon)
4446 mem_cgroup_uncharge_page(used); 4441 mem_cgroup_uncharge_page(used);
4447 } 4442 }
4448 4443
4449 /* 4444 /*
4450 * At replace page cache, newpage is not under any memcg but it's on 4445 * At replace page cache, newpage is not under any memcg but it's on
4451 * LRU. So, this function doesn't touch res_counter but handles LRU 4446 * LRU. So, this function doesn't touch res_counter but handles LRU
4452 * in correct way. Both pages are locked so we cannot race with uncharge. 4447 * in correct way. Both pages are locked so we cannot race with uncharge.
4453 */ 4448 */
4454 void mem_cgroup_replace_page_cache(struct page *oldpage, 4449 void mem_cgroup_replace_page_cache(struct page *oldpage,
4455 struct page *newpage) 4450 struct page *newpage)
4456 { 4451 {
4457 struct mem_cgroup *memcg = NULL; 4452 struct mem_cgroup *memcg = NULL;
4458 struct page_cgroup *pc; 4453 struct page_cgroup *pc;
4459 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4454 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
4460 4455
4461 if (mem_cgroup_disabled()) 4456 if (mem_cgroup_disabled())
4462 return; 4457 return;
4463 4458
4464 pc = lookup_page_cgroup(oldpage); 4459 pc = lookup_page_cgroup(oldpage);
4465 /* fix accounting on old pages */ 4460 /* fix accounting on old pages */
4466 lock_page_cgroup(pc); 4461 lock_page_cgroup(pc);
4467 if (PageCgroupUsed(pc)) { 4462 if (PageCgroupUsed(pc)) {
4468 memcg = pc->mem_cgroup; 4463 memcg = pc->mem_cgroup;
4469 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4464 mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
4470 ClearPageCgroupUsed(pc); 4465 ClearPageCgroupUsed(pc);
4471 } 4466 }
4472 unlock_page_cgroup(pc); 4467 unlock_page_cgroup(pc);
4473 4468
4474 /* 4469 /*
4475 * When called from shmem_replace_page(), in some cases the 4470 * When called from shmem_replace_page(), in some cases the
4476 * oldpage has already been charged, and in some cases not. 4471 * oldpage has already been charged, and in some cases not.
4477 */ 4472 */
4478 if (!memcg) 4473 if (!memcg)
4479 return; 4474 return;
4480 /* 4475 /*
4481 * Even if newpage->mapping was NULL before starting replacement, 4476 * Even if newpage->mapping was NULL before starting replacement,
4482 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4477 * the newpage may be on LRU(or pagevec for LRU) already. We lock
4483 * LRU while we overwrite pc->mem_cgroup. 4478 * LRU while we overwrite pc->mem_cgroup.
4484 */ 4479 */
4485 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4480 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
4486 } 4481 }
4487 4482
4488 #ifdef CONFIG_DEBUG_VM 4483 #ifdef CONFIG_DEBUG_VM
4489 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4484 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
4490 { 4485 {
4491 struct page_cgroup *pc; 4486 struct page_cgroup *pc;
4492 4487
4493 pc = lookup_page_cgroup(page); 4488 pc = lookup_page_cgroup(page);
4494 /* 4489 /*
4495 * Can be NULL while feeding pages into the page allocator for 4490 * Can be NULL while feeding pages into the page allocator for
4496 * the first time, i.e. during boot or memory hotplug; 4491 * the first time, i.e. during boot or memory hotplug;
4497 * or when mem_cgroup_disabled(). 4492 * or when mem_cgroup_disabled().
4498 */ 4493 */
4499 if (likely(pc) && PageCgroupUsed(pc)) 4494 if (likely(pc) && PageCgroupUsed(pc))
4500 return pc; 4495 return pc;
4501 return NULL; 4496 return NULL;
4502 } 4497 }
4503 4498
4504 bool mem_cgroup_bad_page_check(struct page *page) 4499 bool mem_cgroup_bad_page_check(struct page *page)
4505 { 4500 {
4506 if (mem_cgroup_disabled()) 4501 if (mem_cgroup_disabled())
4507 return false; 4502 return false;
4508 4503
4509 return lookup_page_cgroup_used(page) != NULL; 4504 return lookup_page_cgroup_used(page) != NULL;
4510 } 4505 }
4511 4506
4512 void mem_cgroup_print_bad_page(struct page *page) 4507 void mem_cgroup_print_bad_page(struct page *page)
4513 { 4508 {
4514 struct page_cgroup *pc; 4509 struct page_cgroup *pc;
4515 4510
4516 pc = lookup_page_cgroup_used(page); 4511 pc = lookup_page_cgroup_used(page);
4517 if (pc) { 4512 if (pc) {
4518 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4513 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n",
4519 pc, pc->flags, pc->mem_cgroup); 4514 pc, pc->flags, pc->mem_cgroup);
4520 } 4515 }
4521 } 4516 }
4522 #endif 4517 #endif
4523 4518
4524 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4519 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
4525 unsigned long long val) 4520 unsigned long long val)
4526 { 4521 {
4527 int retry_count; 4522 int retry_count;
4528 u64 memswlimit, memlimit; 4523 u64 memswlimit, memlimit;
4529 int ret = 0; 4524 int ret = 0;
4530 int children = mem_cgroup_count_children(memcg); 4525 int children = mem_cgroup_count_children(memcg);
4531 u64 curusage, oldusage; 4526 u64 curusage, oldusage;
4532 int enlarge; 4527 int enlarge;
4533 4528
4534 /* 4529 /*
4535 * For keeping hierarchical_reclaim simple, how long we should retry 4530 * For keeping hierarchical_reclaim simple, how long we should retry
4536 * depends on the callers. We set our retry-count to be a function 4531 * depends on the callers. We set our retry-count to be a function
4537 * of the # of children which we should visit in this loop. 4532 * of the # of children which we should visit in this loop.
4538 */ 4533 */
4539 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4534 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
4540 4535
4541 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4536 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4542 4537
4543 enlarge = 0; 4538 enlarge = 0;
4544 while (retry_count) { 4539 while (retry_count) {
4545 if (signal_pending(current)) { 4540 if (signal_pending(current)) {
4546 ret = -EINTR; 4541 ret = -EINTR;
4547 break; 4542 break;
4548 } 4543 }
4549 /* 4544 /*
4550 * Rather than hiding all of this in some function, I do it in an 4545 * Rather than hiding all of this in some function, I do it in an
4551 * open-coded manner so you can see what it really does. 4546 * open-coded manner so you can see what it really does.
4552 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4547 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4553 */ 4548 */
4554 mutex_lock(&set_limit_mutex); 4549 mutex_lock(&set_limit_mutex);
4555 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4550 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4556 if (memswlimit < val) { 4551 if (memswlimit < val) {
4557 ret = -EINVAL; 4552 ret = -EINVAL;
4558 mutex_unlock(&set_limit_mutex); 4553 mutex_unlock(&set_limit_mutex);
4559 break; 4554 break;
4560 } 4555 }
4561 4556
4562 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4557 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4563 if (memlimit < val) 4558 if (memlimit < val)
4564 enlarge = 1; 4559 enlarge = 1;
4565 4560
4566 ret = res_counter_set_limit(&memcg->res, val); 4561 ret = res_counter_set_limit(&memcg->res, val);
4567 if (!ret) { 4562 if (!ret) {
4568 if (memswlimit == val) 4563 if (memswlimit == val)
4569 memcg->memsw_is_minimum = true; 4564 memcg->memsw_is_minimum = true;
4570 else 4565 else
4571 memcg->memsw_is_minimum = false; 4566 memcg->memsw_is_minimum = false;
4572 } 4567 }
4573 mutex_unlock(&set_limit_mutex); 4568 mutex_unlock(&set_limit_mutex);
4574 4569
4575 if (!ret) 4570 if (!ret)
4576 break; 4571 break;
4577 4572
4578 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4573 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4579 MEM_CGROUP_RECLAIM_SHRINK); 4574 MEM_CGROUP_RECLAIM_SHRINK);
4580 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4575 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
4581 /* Usage is reduced? */ 4576 /* Usage is reduced? */
4582 if (curusage >= oldusage) 4577 if (curusage >= oldusage)
4583 retry_count--; 4578 retry_count--;
4584 else 4579 else
4585 oldusage = curusage; 4580 oldusage = curusage;
4586 } 4581 }
4587 if (!ret && enlarge) 4582 if (!ret && enlarge)
4588 memcg_oom_recover(memcg); 4583 memcg_oom_recover(memcg);
4589 4584
4590 return ret; 4585 return ret;
4591 } 4586 }
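
mem_cgroup_resize_limit() follows a common retry pattern: try to set the limit, reclaim if that fails, and consume one of the retries only when usage did not drop, i.e. when reclaim made no progress. A self-contained sketch of that pattern, with simulated stand-ins for the usage counter, res_counter_set_limit() and reclaim:

#include <stdbool.h>
#include <stdio.h>

static unsigned long long usage = 100;          /* simulated RES_USAGE, in pages */

static bool try_set_limit(unsigned long long val)
{
	return usage <= val;                    /* like res_counter_set_limit() */
}

static void reclaim_some(void)
{
	if (usage >= 10)
		usage -= 10;                    /* pretend reclaim freed pages */
}

static int resize_limit(unsigned long long val, int retries)
{
	unsigned long long old = usage, cur;

	while (retries) {
		if (try_set_limit(val))
			return 0;               /* the limit now fits usage */
		reclaim_some();
		cur = usage;
		if (cur >= old)
			retries--;              /* no progress: consume a retry */
		else
			old = cur;              /* progress: keep going */
	}
	return -1;                              /* give up, akin to -EBUSY */
}

int main(void)
{
	printf("resize to 50: %d (usage now %llu)\n", resize_limit(50, 5), usage);
	return 0;
}
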
4592 4587
4593 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4588 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
4594 unsigned long long val) 4589 unsigned long long val)
4595 { 4590 {
4596 int retry_count; 4591 int retry_count;
4597 u64 memlimit, memswlimit, oldusage, curusage; 4592 u64 memlimit, memswlimit, oldusage, curusage;
4598 int children = mem_cgroup_count_children(memcg); 4593 int children = mem_cgroup_count_children(memcg);
4599 int ret = -EBUSY; 4594 int ret = -EBUSY;
4600 int enlarge = 0; 4595 int enlarge = 0;
4601 4596
4602 /* see mem_cgroup_resize_res_limit */ 4597 /* see mem_cgroup_resize_res_limit */
4603 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4598 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
4604 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4599 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4605 while (retry_count) { 4600 while (retry_count) {
4606 if (signal_pending(current)) { 4601 if (signal_pending(current)) {
4607 ret = -EINTR; 4602 ret = -EINTR;
4608 break; 4603 break;
4609 } 4604 }
4610 /* 4605 /*
4611 * Rather than hiding all of this in some function, I do it in an 4606 * Rather than hiding all of this in some function, I do it in an
4612 * open-coded manner so you can see what it really does. 4607 * open-coded manner so you can see what it really does.
4613 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4608 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
4614 */ 4609 */
4615 mutex_lock(&set_limit_mutex); 4610 mutex_lock(&set_limit_mutex);
4616 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4611 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4617 if (memlimit > val) { 4612 if (memlimit > val) {
4618 ret = -EINVAL; 4613 ret = -EINVAL;
4619 mutex_unlock(&set_limit_mutex); 4614 mutex_unlock(&set_limit_mutex);
4620 break; 4615 break;
4621 } 4616 }
4622 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4617 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4623 if (memswlimit < val) 4618 if (memswlimit < val)
4624 enlarge = 1; 4619 enlarge = 1;
4625 ret = res_counter_set_limit(&memcg->memsw, val); 4620 ret = res_counter_set_limit(&memcg->memsw, val);
4626 if (!ret) { 4621 if (!ret) {
4627 if (memlimit == val) 4622 if (memlimit == val)
4628 memcg->memsw_is_minimum = true; 4623 memcg->memsw_is_minimum = true;
4629 else 4624 else
4630 memcg->memsw_is_minimum = false; 4625 memcg->memsw_is_minimum = false;
4631 } 4626 }
4632 mutex_unlock(&set_limit_mutex); 4627 mutex_unlock(&set_limit_mutex);
4633 4628
4634 if (!ret) 4629 if (!ret)
4635 break; 4630 break;
4636 4631
4637 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4632 mem_cgroup_reclaim(memcg, GFP_KERNEL,
4638 MEM_CGROUP_RECLAIM_NOSWAP | 4633 MEM_CGROUP_RECLAIM_NOSWAP |
4639 MEM_CGROUP_RECLAIM_SHRINK); 4634 MEM_CGROUP_RECLAIM_SHRINK);
4640 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4635 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
4641 /* Usage is reduced? */ 4636 /* Usage is reduced? */
4642 if (curusage >= oldusage) 4637 if (curusage >= oldusage)
4643 retry_count--; 4638 retry_count--;
4644 else 4639 else
4645 oldusage = curusage; 4640 oldusage = curusage;
4646 } 4641 }
4647 if (!ret && enlarge) 4642 if (!ret && enlarge)
4648 memcg_oom_recover(memcg); 4643 memcg_oom_recover(memcg);
4649 return ret; 4644 return ret;
4650 } 4645 }
4651 4646
4652 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4647 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
4653 gfp_t gfp_mask, 4648 gfp_t gfp_mask,
4654 unsigned long *total_scanned) 4649 unsigned long *total_scanned)
4655 { 4650 {
4656 unsigned long nr_reclaimed = 0; 4651 unsigned long nr_reclaimed = 0;
4657 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4652 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
4658 unsigned long reclaimed; 4653 unsigned long reclaimed;
4659 int loop = 0; 4654 int loop = 0;
4660 struct mem_cgroup_tree_per_zone *mctz; 4655 struct mem_cgroup_tree_per_zone *mctz;
4661 unsigned long long excess; 4656 unsigned long long excess;
4662 unsigned long nr_scanned; 4657 unsigned long nr_scanned;
4663 4658
4664 if (order > 0) 4659 if (order > 0)
4665 return 0; 4660 return 0;
4666 4661
4667 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4662 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
4668 /* 4663 /*
4669 * This loop can run for a while, especially if mem_cgroups continuously 4664 * This loop can run for a while, especially if mem_cgroups continuously
4670 * keep exceeding their soft limit and putting the system under 4665 * keep exceeding their soft limit and putting the system under
4671 * pressure 4666 * pressure
4672 */ 4667 */
4673 do { 4668 do {
4674 if (next_mz) 4669 if (next_mz)
4675 mz = next_mz; 4670 mz = next_mz;
4676 else 4671 else
4677 mz = mem_cgroup_largest_soft_limit_node(mctz); 4672 mz = mem_cgroup_largest_soft_limit_node(mctz);
4678 if (!mz) 4673 if (!mz)
4679 break; 4674 break;
4680 4675
4681 nr_scanned = 0; 4676 nr_scanned = 0;
4682 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4677 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
4683 gfp_mask, &nr_scanned); 4678 gfp_mask, &nr_scanned);
4684 nr_reclaimed += reclaimed; 4679 nr_reclaimed += reclaimed;
4685 *total_scanned += nr_scanned; 4680 *total_scanned += nr_scanned;
4686 spin_lock(&mctz->lock); 4681 spin_lock(&mctz->lock);
4687 4682
4688 /* 4683 /*
4689 * If we failed to reclaim anything from this memory cgroup 4684 * If we failed to reclaim anything from this memory cgroup
4690 * it is time to move on to the next cgroup 4685 * it is time to move on to the next cgroup
4691 */ 4686 */
4692 next_mz = NULL; 4687 next_mz = NULL;
4693 if (!reclaimed) { 4688 if (!reclaimed) {
4694 do { 4689 do {
4695 /* 4690 /*
4696 * Loop until we find yet another one. 4691 * Loop until we find yet another one.
4697 * 4692 *
4698 * By the time we get the soft_limit lock 4693 * By the time we get the soft_limit lock
4699 * again, someone might have added the 4694 * again, someone might have added the
4700 * group back on the RB tree. Iterate to 4695 * group back on the RB tree. Iterate to
4701 * make sure we get a different mem. 4696 * make sure we get a different mem.
4702 * mem_cgroup_largest_soft_limit_node returns 4697 * mem_cgroup_largest_soft_limit_node returns
4703 * NULL if no other cgroup is present on 4698 * NULL if no other cgroup is present on
4704 * the tree 4699 * the tree
4705 */ 4700 */
4706 next_mz = 4701 next_mz =
4707 __mem_cgroup_largest_soft_limit_node(mctz); 4702 __mem_cgroup_largest_soft_limit_node(mctz);
4708 if (next_mz == mz) 4703 if (next_mz == mz)
4709 css_put(&next_mz->memcg->css); 4704 css_put(&next_mz->memcg->css);
4710 else /* next_mz == NULL or other memcg */ 4705 else /* next_mz == NULL or other memcg */
4711 break; 4706 break;
4712 } while (1); 4707 } while (1);
4713 } 4708 }
4714 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4709 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
4715 excess = res_counter_soft_limit_excess(&mz->memcg->res); 4710 excess = res_counter_soft_limit_excess(&mz->memcg->res);
4716 /* 4711 /*
4717 * One school of thought says that we should not add 4712 * One school of thought says that we should not add
4718 * back the node to the tree if reclaim returns 0. 4713 * back the node to the tree if reclaim returns 0.
4719 * But our reclaim could return 0 simply because, due 4714 * But our reclaim could return 0 simply because, due
4720 * to priority, we are exposing a smaller subset of 4715 * to priority, we are exposing a smaller subset of
4721 * memory to reclaim from. Consider this as a longer 4716 * memory to reclaim from. Consider this as a longer
4722 * term TODO. 4717 * term TODO.
4723 */ 4718 */
4724 /* If excess == 0, no tree ops */ 4719 /* If excess == 0, no tree ops */
4725 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4720 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
4726 spin_unlock(&mctz->lock); 4721 spin_unlock(&mctz->lock);
4727 css_put(&mz->memcg->css); 4722 css_put(&mz->memcg->css);
4728 loop++; 4723 loop++;
4729 /* 4724 /*
4730 * Could not reclaim anything and there are no more 4725 * Could not reclaim anything and there are no more
4731 * mem cgroups to try or we seem to be looping without 4726 * mem cgroups to try or we seem to be looping without
4732 * reclaiming anything. 4727 * reclaiming anything.
4733 */ 4728 */
4734 if (!nr_reclaimed && 4729 if (!nr_reclaimed &&
4735 (next_mz == NULL || 4730 (next_mz == NULL ||
4736 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 4731 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
4737 break; 4732 break;
4738 } while (!nr_reclaimed); 4733 } while (!nr_reclaimed);
4739 if (next_mz) 4734 if (next_mz)
4740 css_put(&next_mz->memcg->css); 4735 css_put(&next_mz->memcg->css);
4741 return nr_reclaimed; 4736 return nr_reclaimed;
4742 } 4737 }
4743 4738
4744 /** 4739 /**
4745 * mem_cgroup_force_empty_list - clears LRU of a group 4740 * mem_cgroup_force_empty_list - clears LRU of a group
4746 * @memcg: group to clear 4741 * @memcg: group to clear
4747 * @node: NUMA node 4742 * @node: NUMA node
4748 * @zid: zone id 4743 * @zid: zone id
4749 * @lru: lru to clear 4744 * @lru: lru to clear
4750 * 4745 *
4751 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4746 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
4752 * reclaim the pages themselves - pages are moved to the parent (or root) 4747 * reclaim the pages themselves - pages are moved to the parent (or root)
4753 * group. 4748 * group.
4754 */ 4749 */
4755 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4750 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
4756 int node, int zid, enum lru_list lru) 4751 int node, int zid, enum lru_list lru)
4757 { 4752 {
4758 struct lruvec *lruvec; 4753 struct lruvec *lruvec;
4759 unsigned long flags; 4754 unsigned long flags;
4760 struct list_head *list; 4755 struct list_head *list;
4761 struct page *busy; 4756 struct page *busy;
4762 struct zone *zone; 4757 struct zone *zone;
4763 4758
4764 zone = &NODE_DATA(node)->node_zones[zid]; 4759 zone = &NODE_DATA(node)->node_zones[zid];
4765 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4760 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
4766 list = &lruvec->lists[lru]; 4761 list = &lruvec->lists[lru];
4767 4762
4768 busy = NULL; 4763 busy = NULL;
4769 do { 4764 do {
4770 struct page_cgroup *pc; 4765 struct page_cgroup *pc;
4771 struct page *page; 4766 struct page *page;
4772 4767
4773 spin_lock_irqsave(&zone->lru_lock, flags); 4768 spin_lock_irqsave(&zone->lru_lock, flags);
4774 if (list_empty(list)) { 4769 if (list_empty(list)) {
4775 spin_unlock_irqrestore(&zone->lru_lock, flags); 4770 spin_unlock_irqrestore(&zone->lru_lock, flags);
4776 break; 4771 break;
4777 } 4772 }
4778 page = list_entry(list->prev, struct page, lru); 4773 page = list_entry(list->prev, struct page, lru);
4779 if (busy == page) { 4774 if (busy == page) {
4780 list_move(&page->lru, list); 4775 list_move(&page->lru, list);
4781 busy = NULL; 4776 busy = NULL;
4782 spin_unlock_irqrestore(&zone->lru_lock, flags); 4777 spin_unlock_irqrestore(&zone->lru_lock, flags);
4783 continue; 4778 continue;
4784 } 4779 }
4785 spin_unlock_irqrestore(&zone->lru_lock, flags); 4780 spin_unlock_irqrestore(&zone->lru_lock, flags);
4786 4781
4787 pc = lookup_page_cgroup(page); 4782 pc = lookup_page_cgroup(page);
4788 4783
4789 if (mem_cgroup_move_parent(page, pc, memcg)) { 4784 if (mem_cgroup_move_parent(page, pc, memcg)) {
4790 /* found lock contention or "pc" is obsolete. */ 4785 /* found lock contention or "pc" is obsolete. */
4791 busy = page; 4786 busy = page;
4792 cond_resched(); 4787 cond_resched();
4793 } else 4788 } else
4794 busy = NULL; 4789 busy = NULL;
4795 } while (!list_empty(list)); 4790 } while (!list_empty(list));
4796 } 4791 }
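
The "busy" variable above is a small anti-livelock trick: if the same page shows up at the tail twice in a row because it could not be moved to the parent (lock contention, obsolete pc), it is rotated to the head so the loop keeps making progress on other pages. A compact model of that behaviour, with an integer array standing in for the LRU and a stub move_to_parent() that fails once; all names here are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define N 4
static int list[N] = {1, 2, 3, 4};      /* index N-1 plays the role of list->prev */
static int len = N;

static bool move_to_parent(int page)
{
	static bool contended = true;

	if (page == 3 && contended) {   /* fail once to exercise the busy path */
		contended = false;
		return false;
	}
	return true;
}

int main(void)
{
	int busy = -1;

	while (len) {
		int page = list[len - 1];       /* take the tail */

		if (page == busy) {
			/* seen it fail already: rotate to the head and move on */
			for (int i = len - 1; i > 0; i--)
				list[i] = list[i - 1];
			list[0] = page;
			busy = -1;
			continue;
		}
		if (move_to_parent(page)) {
			len--;                  /* page left this LRU */
			busy = -1;
		} else {
			busy = page;            /* remember it and retry later */
		}
	}
	printf("list drained\n");
	return 0;
}
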
4797 4792
4798 /* 4793 /*
4799 * Make the mem_cgroup's charge 0 if there is no task, by moving 4794 * Make the mem_cgroup's charge 0 if there is no task, by moving
4800 * all the charges and pages to the parent. 4795 * all the charges and pages to the parent.
4801 * This enables deleting this mem_cgroup. 4796 * This enables deleting this mem_cgroup.
4802 * 4797 *
4803 * Caller is responsible for holding css reference on the memcg. 4798 * Caller is responsible for holding css reference on the memcg.
4804 */ 4799 */
4805 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4800 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
4806 { 4801 {
4807 int node, zid; 4802 int node, zid;
4808 u64 usage; 4803 u64 usage;
4809 4804
4810 do { 4805 do {
4811 /* This is for making all *used* pages be on the LRU. */ 4806 /* This is for making all *used* pages be on the LRU. */
4812 lru_add_drain_all(); 4807 lru_add_drain_all();
4813 drain_all_stock_sync(memcg); 4808 drain_all_stock_sync(memcg);
4814 mem_cgroup_start_move(memcg); 4809 mem_cgroup_start_move(memcg);
4815 for_each_node_state(node, N_MEMORY) { 4810 for_each_node_state(node, N_MEMORY) {
4816 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4811 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4817 enum lru_list lru; 4812 enum lru_list lru;
4818 for_each_lru(lru) { 4813 for_each_lru(lru) {
4819 mem_cgroup_force_empty_list(memcg, 4814 mem_cgroup_force_empty_list(memcg,
4820 node, zid, lru); 4815 node, zid, lru);
4821 } 4816 }
4822 } 4817 }
4823 } 4818 }
4824 mem_cgroup_end_move(memcg); 4819 mem_cgroup_end_move(memcg);
4825 memcg_oom_recover(memcg); 4820 memcg_oom_recover(memcg);
4826 cond_resched(); 4821 cond_resched();
4827 4822
4828 /* 4823 /*
4829 * Kernel memory may not necessarily be attributable to a specific 4824 * Kernel memory may not necessarily be attributable to a specific
4830 * process, so such charges are not migrated, and therefore we can't 4825 * process, so such charges are not migrated, and therefore we can't
4831 * expect their value to drop to 0 here. 4826 * expect their value to drop to 0 here.
4832 * Having res filled up with kmem only is enough. 4827 * Having res filled up with kmem only is enough.
4833 * 4828 *
4834 * This is a safety check because mem_cgroup_force_empty_list 4829 * This is a safety check because mem_cgroup_force_empty_list
4835 * could have raced with mem_cgroup_replace_page_cache callers 4830 * could have raced with mem_cgroup_replace_page_cache callers
4836 * so the lru seemed empty but the page could have been added 4831 * so the lru seemed empty but the page could have been added
4837 * right after the check. RES_USAGE should be safe as we always 4832 * right after the check. RES_USAGE should be safe as we always
4838 * charge before adding to the LRU. 4833 * charge before adding to the LRU.
4839 */ 4834 */
4840 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4835 usage = res_counter_read_u64(&memcg->res, RES_USAGE) -
4841 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4836 res_counter_read_u64(&memcg->kmem, RES_USAGE);
4842 } while (usage > 0); 4837 } while (usage > 0);
4843 } 4838 }
4844 4839
4845 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4840 static inline bool memcg_has_children(struct mem_cgroup *memcg)
4846 { 4841 {
4847 lockdep_assert_held(&memcg_create_mutex); 4842 lockdep_assert_held(&memcg_create_mutex);
4848 /* 4843 /*
4849 * The lock does not prevent addition or deletion to the list 4844 * The lock does not prevent addition or deletion to the list
4850 * of children, but it prevents a new child from being 4845 * of children, but it prevents a new child from being
4851 * initialized based on this parent in css_online(), so it's 4846 * initialized based on this parent in css_online(), so it's
4852 * enough to decide whether hierarchically inherited 4847 * enough to decide whether hierarchically inherited
4853 * attributes can still be changed or not. 4848 * attributes can still be changed or not.
4854 */ 4849 */
4855 return memcg->use_hierarchy && 4850 return memcg->use_hierarchy &&
4856 !list_empty(&memcg->css.cgroup->children); 4851 !list_empty(&memcg->css.cgroup->children);
4857 } 4852 }
4858 4853
4859 /* 4854 /*
4860 * Reclaims as many pages from the given memcg as possible and moves 4855 * Reclaims as many pages from the given memcg as possible and moves
4861 * the rest to the parent. 4856 * the rest to the parent.
4862 * 4857 *
4863 * Caller is responsible for holding css reference for memcg. 4858 * Caller is responsible for holding css reference for memcg.
4864 */ 4859 */
4865 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 4860 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
4866 { 4861 {
4867 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 4862 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
4868 struct cgroup *cgrp = memcg->css.cgroup; 4863 struct cgroup *cgrp = memcg->css.cgroup;
4869 4864
4870 /* returns EBUSY if there is a task or if we come here twice. */ 4865 /* returns EBUSY if there is a task or if we come here twice. */
4871 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) 4866 if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
4872 return -EBUSY; 4867 return -EBUSY;
4873 4868
4874 /* we call try-to-free pages to make this cgroup empty */ 4869 /* we call try-to-free pages to make this cgroup empty */
4875 lru_add_drain_all(); 4870 lru_add_drain_all();
4876 /* try to free all pages in this cgroup */ 4871 /* try to free all pages in this cgroup */
4877 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 4872 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
4878 int progress; 4873 int progress;
4879 4874
4880 if (signal_pending(current)) 4875 if (signal_pending(current))
4881 return -EINTR; 4876 return -EINTR;
4882 4877
4883 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 4878 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
4884 false); 4879 false);
4885 if (!progress) { 4880 if (!progress) {
4886 nr_retries--; 4881 nr_retries--;
4887 /* maybe some writeback is necessary */ 4882 /* maybe some writeback is necessary */
4888 congestion_wait(BLK_RW_ASYNC, HZ/10); 4883 congestion_wait(BLK_RW_ASYNC, HZ/10);
4889 } 4884 }
4890 4885
4891 } 4886 }
4892 lru_add_drain(); 4887 lru_add_drain();
4893 mem_cgroup_reparent_charges(memcg); 4888 mem_cgroup_reparent_charges(memcg);
4894 4889
4895 return 0; 4890 return 0;
4896 } 4891 }
4897 4892
4898 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, 4893 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css,
4899 unsigned int event) 4894 unsigned int event)
4900 { 4895 {
4901 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4896 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4902 4897
4903 if (mem_cgroup_is_root(memcg)) 4898 if (mem_cgroup_is_root(memcg))
4904 return -EINVAL; 4899 return -EINVAL;
4905 return mem_cgroup_force_empty(memcg); 4900 return mem_cgroup_force_empty(memcg);
4906 } 4901 }
4907 4902
4908 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 4903 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
4909 struct cftype *cft) 4904 struct cftype *cft)
4910 { 4905 {
4911 return mem_cgroup_from_css(css)->use_hierarchy; 4906 return mem_cgroup_from_css(css)->use_hierarchy;
4912 } 4907 }
4913 4908
4914 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 4909 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
4915 struct cftype *cft, u64 val) 4910 struct cftype *cft, u64 val)
4916 { 4911 {
4917 int retval = 0; 4912 int retval = 0;
4918 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4913 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4919 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 4914 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css));
4920 4915
4921 mutex_lock(&memcg_create_mutex); 4916 mutex_lock(&memcg_create_mutex);
4922 4917
4923 if (memcg->use_hierarchy == val) 4918 if (memcg->use_hierarchy == val)
4924 goto out; 4919 goto out;
4925 4920
4926 /* 4921 /*
4927 * If parent's use_hierarchy is set, we can't make any modifications 4922 * If parent's use_hierarchy is set, we can't make any modifications
4928 * in the child subtrees. If it is unset, then the change can 4923 * in the child subtrees. If it is unset, then the change can
4929 * occur, provided the current cgroup has no children. 4924 * occur, provided the current cgroup has no children.
4930 * 4925 *
4931 * For the root cgroup, parent_mem is NULL, we allow value to be 4926 * For the root cgroup, parent_mem is NULL, we allow value to be
4932 * set if there are no children. 4927 * set if there are no children.
4933 */ 4928 */
4934 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 4929 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
4935 (val == 1 || val == 0)) { 4930 (val == 1 || val == 0)) {
4936 if (list_empty(&memcg->css.cgroup->children)) 4931 if (list_empty(&memcg->css.cgroup->children))
4937 memcg->use_hierarchy = val; 4932 memcg->use_hierarchy = val;
4938 else 4933 else
4939 retval = -EBUSY; 4934 retval = -EBUSY;
4940 } else 4935 } else
4941 retval = -EINVAL; 4936 retval = -EINVAL;
4942 4937
4943 out: 4938 out:
4944 mutex_unlock(&memcg_create_mutex); 4939 mutex_unlock(&memcg_create_mutex);
4945 4940
4946 return retval; 4941 return retval;
4947 } 4942 }
4948 4943
4949 4944
4950 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 4945 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
4951 enum mem_cgroup_stat_index idx) 4946 enum mem_cgroup_stat_index idx)
4952 { 4947 {
4953 struct mem_cgroup *iter; 4948 struct mem_cgroup *iter;
4954 long val = 0; 4949 long val = 0;
4955 4950
4956 /* Per-cpu values can be negative, use a signed accumulator */ 4951 /* Per-cpu values can be negative, use a signed accumulator */
4957 for_each_mem_cgroup_tree(iter, memcg) 4952 for_each_mem_cgroup_tree(iter, memcg)
4958 val += mem_cgroup_read_stat(iter, idx); 4953 val += mem_cgroup_read_stat(iter, idx);
4959 4954
4960 if (val < 0) /* race ? */ 4955 if (val < 0) /* race ? */
4961 val = 0; 4956 val = 0;
4962 return val; 4957 return val;
4963 } 4958 }
4964 4959
4965 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 4960 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
4966 { 4961 {
4967 u64 val; 4962 u64 val;
4968 4963
4969 if (!mem_cgroup_is_root(memcg)) { 4964 if (!mem_cgroup_is_root(memcg)) {
4970 if (!swap) 4965 if (!swap)
4971 return res_counter_read_u64(&memcg->res, RES_USAGE); 4966 return res_counter_read_u64(&memcg->res, RES_USAGE);
4972 else 4967 else
4973 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 4968 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4974 } 4969 }
4975 4970
4976 /* 4971 /*
4977 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 4972 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
4978 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 4973 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
4979 */ 4974 */
4980 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 4975 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4981 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 4976 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4982 4977
4983 if (swap) 4978 if (swap)
4984 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 4979 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
4985 4980
4986 return val << PAGE_SHIFT; 4981 return val << PAGE_SHIFT;
4987 } 4982 }
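
For the root memcg the usage is not read from the res_counter but reconstructed from the recursive statistics: page counts for cache and rss (plus swap for the memsw variant) are summed and shifted by PAGE_SHIFT to get bytes. A tiny worked example with made-up counts:

#include <stdio.h>

#define PAGE_SHIFT 12   /* 4 KiB pages, the common case */

int main(void)
{
	unsigned long long cache = 300, rss = 700, swap = 50;   /* in pages */

	unsigned long long mem   = (cache + rss) << PAGE_SHIFT;
	unsigned long long memsw = (cache + rss + swap) << PAGE_SHIFT;

	printf("usage_in_bytes:       %llu\n", mem);    /* 4096000 */
	printf("memsw.usage_in_bytes: %llu\n", memsw);  /* 4300800 */
	return 0;
}
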
4988 4983
4989 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, 4984 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
4990 struct cftype *cft) 4985 struct cftype *cft)
4991 { 4986 {
4992 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 4987 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4993 u64 val; 4988 u64 val;
4994 int name; 4989 int name;
4995 enum res_type type; 4990 enum res_type type;
4996 4991
4997 type = MEMFILE_TYPE(cft->private); 4992 type = MEMFILE_TYPE(cft->private);
4998 name = MEMFILE_ATTR(cft->private); 4993 name = MEMFILE_ATTR(cft->private);
4999 4994
5000 switch (type) { 4995 switch (type) {
5001 case _MEM: 4996 case _MEM:
5002 if (name == RES_USAGE) 4997 if (name == RES_USAGE)
5003 val = mem_cgroup_usage(memcg, false); 4998 val = mem_cgroup_usage(memcg, false);
5004 else 4999 else
5005 val = res_counter_read_u64(&memcg->res, name); 5000 val = res_counter_read_u64(&memcg->res, name);
5006 break; 5001 break;
5007 case _MEMSWAP: 5002 case _MEMSWAP:
5008 if (name == RES_USAGE) 5003 if (name == RES_USAGE)
5009 val = mem_cgroup_usage(memcg, true); 5004 val = mem_cgroup_usage(memcg, true);
5010 else 5005 else
5011 val = res_counter_read_u64(&memcg->memsw, name); 5006 val = res_counter_read_u64(&memcg->memsw, name);
5012 break; 5007 break;
5013 case _KMEM: 5008 case _KMEM:
5014 val = res_counter_read_u64(&memcg->kmem, name); 5009 val = res_counter_read_u64(&memcg->kmem, name);
5015 break; 5010 break;
5016 default: 5011 default:
5017 BUG(); 5012 BUG();
5018 } 5013 }
5019 5014
5020 return val; 5015 return val;
5021 } 5016 }
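
mem_cgroup_read_u64() relies on cft->private carrying both a resource type and an attribute in one integer, which MEMFILE_TYPE() and MEMFILE_ATTR() unpack. The real macros are defined elsewhere in memcontrol.c; the sketch below assumes a simple 16-bit split and uses its own illustrative enums, so the exact values are not the kernel's.

#include <stdio.h>

enum res_type { _MEM, _MEMSWAP, _KMEM };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

#define PACK(type, attr)   (((type) << 16) | (attr))
#define UNPACK_TYPE(val)   (((val) >> 16) & 0xffff)
#define UNPACK_ATTR(val)   ((val) & 0xffff)

int main(void)
{
	int private = PACK(_MEMSWAP, RES_LIMIT);

	/* prints "type=1 attr=1" with the enums declared above */
	printf("type=%d attr=%d\n", UNPACK_TYPE(private), UNPACK_ATTR(private));
	return 0;
}
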
5022 5017
5023 #ifdef CONFIG_MEMCG_KMEM 5018 #ifdef CONFIG_MEMCG_KMEM
5024 /* should be called with activate_kmem_mutex held */ 5019 /* should be called with activate_kmem_mutex held */
5025 static int __memcg_activate_kmem(struct mem_cgroup *memcg, 5020 static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5026 unsigned long long limit) 5021 unsigned long long limit)
5027 { 5022 {
5028 int err = 0; 5023 int err = 0;
5029 int memcg_id; 5024 int memcg_id;
5030 5025
5031 if (memcg_kmem_is_active(memcg)) 5026 if (memcg_kmem_is_active(memcg))
5032 return 0; 5027 return 0;
5033 5028
5034 /* 5029 /*
5035 * We are going to allocate memory for data shared by all memory 5030 * We are going to allocate memory for data shared by all memory
5036 * cgroups so let's stop accounting here. 5031 * cgroups so let's stop accounting here.
5037 */ 5032 */
5038 memcg_stop_kmem_account(); 5033 memcg_stop_kmem_account();
5039 5034
5040 /* 5035 /*
5041 * For simplicity, we won't allow this to be disabled. It also can't 5036 * For simplicity, we won't allow this to be disabled. It also can't
5042 * be changed if the cgroup has children already, or if tasks had 5037 * be changed if the cgroup has children already, or if tasks had
5043 * already joined. 5038 * already joined.
5044 * 5039 *
5045 * If tasks join before we set the limit, a person looking at 5040 * If tasks join before we set the limit, a person looking at
5046 * kmem.usage_in_bytes will have no way to determine when it took 5041 * kmem.usage_in_bytes will have no way to determine when it took
5047 * place, which makes the value quite meaningless. 5042 * place, which makes the value quite meaningless.
5048 * 5043 *
5049 * After it first became limited, changes in the value of the limit are 5044 * After it first became limited, changes in the value of the limit are
5050 * of course permitted. 5045 * of course permitted.
5051 */ 5046 */
5052 mutex_lock(&memcg_create_mutex); 5047 mutex_lock(&memcg_create_mutex);
5053 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) 5048 if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
5054 err = -EBUSY; 5049 err = -EBUSY;
5055 mutex_unlock(&memcg_create_mutex); 5050 mutex_unlock(&memcg_create_mutex);
5056 if (err) 5051 if (err)
5057 goto out; 5052 goto out;
5058 5053
5059 memcg_id = ida_simple_get(&kmem_limited_groups, 5054 memcg_id = ida_simple_get(&kmem_limited_groups,
5060 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 5055 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5061 if (memcg_id < 0) { 5056 if (memcg_id < 0) {
5062 err = memcg_id; 5057 err = memcg_id;
5063 goto out; 5058 goto out;
5064 } 5059 }
5065 5060
5066 /* 5061 /*
5067 * Make sure we have enough space for this cgroup in each root cache's 5062 * Make sure we have enough space for this cgroup in each root cache's
5068 * memcg_params. 5063 * memcg_params.
5069 */ 5064 */
5070 err = memcg_update_all_caches(memcg_id + 1); 5065 err = memcg_update_all_caches(memcg_id + 1);
5071 if (err) 5066 if (err)
5072 goto out_rmid; 5067 goto out_rmid;
5073 5068
5074 memcg->kmemcg_id = memcg_id; 5069 memcg->kmemcg_id = memcg_id;
5075 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 5070 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5076 mutex_init(&memcg->slab_caches_mutex); 5071 mutex_init(&memcg->slab_caches_mutex);
5077 5072
5078 /* 5073 /*
5079 * We couldn't have accounted to this cgroup, because it hasn't got the 5074 * We couldn't have accounted to this cgroup, because it hasn't got the
5080 * active bit set yet, so this should succeed. 5075 * active bit set yet, so this should succeed.
5081 */ 5076 */
5082 err = res_counter_set_limit(&memcg->kmem, limit); 5077 err = res_counter_set_limit(&memcg->kmem, limit);
5083 VM_BUG_ON(err); 5078 VM_BUG_ON(err);
5084 5079
5085 static_key_slow_inc(&memcg_kmem_enabled_key); 5080 static_key_slow_inc(&memcg_kmem_enabled_key);
5086 /* 5081 /*
5087 * Setting the active bit after enabling static branching will 5082 * Setting the active bit after enabling static branching will
5088 * guarantee no one starts accounting before all call sites are 5083 * guarantee no one starts accounting before all call sites are
5089 * patched. 5084 * patched.
5090 */ 5085 */
5091 memcg_kmem_set_active(memcg); 5086 memcg_kmem_set_active(memcg);
5092 out: 5087 out:
5093 memcg_resume_kmem_account(); 5088 memcg_resume_kmem_account();
5094 return err; 5089 return err;
5095 5090
5096 out_rmid: 5091 out_rmid:
5097 ida_simple_remove(&kmem_limited_groups, memcg_id); 5092 ida_simple_remove(&kmem_limited_groups, memcg_id);
5098 goto out; 5093 goto out;
5099 } 5094 }
5100 5095
5101 static int memcg_activate_kmem(struct mem_cgroup *memcg, 5096 static int memcg_activate_kmem(struct mem_cgroup *memcg,
5102 unsigned long long limit) 5097 unsigned long long limit)
5103 { 5098 {
5104 int ret; 5099 int ret;
5105 5100
5106 mutex_lock(&activate_kmem_mutex); 5101 mutex_lock(&activate_kmem_mutex);
5107 ret = __memcg_activate_kmem(memcg, limit); 5102 ret = __memcg_activate_kmem(memcg, limit);
5108 mutex_unlock(&activate_kmem_mutex); 5103 mutex_unlock(&activate_kmem_mutex);
5109 return ret; 5104 return ret;
5110 } 5105 }
5111 5106
5112 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 5107 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5113 unsigned long long val) 5108 unsigned long long val)
5114 { 5109 {
5115 int ret; 5110 int ret;
5116 5111
5117 if (!memcg_kmem_is_active(memcg)) 5112 if (!memcg_kmem_is_active(memcg))
5118 ret = memcg_activate_kmem(memcg, val); 5113 ret = memcg_activate_kmem(memcg, val);
5119 else 5114 else
5120 ret = res_counter_set_limit(&memcg->kmem, val); 5115 ret = res_counter_set_limit(&memcg->kmem, val);
5121 return ret; 5116 return ret;
5122 } 5117 }
5123 5118
5124 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5119 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5125 { 5120 {
5126 int ret = 0; 5121 int ret = 0;
5127 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5122 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5128 5123
5129 if (!parent) 5124 if (!parent)
5130 return 0; 5125 return 0;
5131 5126
5132 mutex_lock(&activate_kmem_mutex); 5127 mutex_lock(&activate_kmem_mutex);
5133 /* 5128 /*
5134 * If the parent cgroup is not kmem-active now, it cannot be activated 5129 * If the parent cgroup is not kmem-active now, it cannot be activated
5135 * after this point, because it has at least one child already. 5130 * after this point, because it has at least one child already.
5136 */ 5131 */
5137 if (memcg_kmem_is_active(parent)) 5132 if (memcg_kmem_is_active(parent))
5138 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX); 5133 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5139 mutex_unlock(&activate_kmem_mutex); 5134 mutex_unlock(&activate_kmem_mutex);
5140 return ret; 5135 return ret;
5141 } 5136 }
5142 #else 5137 #else
5143 static int memcg_update_kmem_limit(struct mem_cgroup *memcg, 5138 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5144 unsigned long long val) 5139 unsigned long long val)
5145 { 5140 {
5146 return -EINVAL; 5141 return -EINVAL;
5147 } 5142 }
5148 #endif /* CONFIG_MEMCG_KMEM */ 5143 #endif /* CONFIG_MEMCG_KMEM */
5149 5144
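The kmem activation path above is only reachable through the cgroup v1 filesystem. As a rough illustration (not part of the commit), the sketch below shows how a userspace program would end up in memcg_update_kmem_limit() by writing a kmem limit; the /sys/fs/cgroup/memory mount point, the cgroup name "demo" and the 64M value are assumptions for the example. Per the check in __memcg_activate_kmem(), the write must happen while the cgroup is still empty (no tasks, no children), otherwise it fails with EBUSY.

/*
 * Hypothetical userspace sketch: enable kmem accounting for an empty
 * cgroup by setting memory.kmem.limit_in_bytes. Path and limit value
 * are made up for the example.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/demo/memory.kmem.limit_in_bytes";
	const char *limit = "67108864\n";	/* 64M, arbitrary */
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, limit, strlen(limit)) < 0)
		perror("kmem limit write");	/* EBUSY if tasks or children exist */
	if (fd >= 0)
		close(fd);
	return 0;
}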
5150 /* 5145 /*
5151 * The user of this function is... 5146 * The user of this function is...
5152 * RES_LIMIT. 5147 * RES_LIMIT.
5153 */ 5148 */
5154 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5149 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5155 char *buffer) 5150 char *buffer)
5156 { 5151 {
5157 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5152 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5158 enum res_type type; 5153 enum res_type type;
5159 int name; 5154 int name;
5160 unsigned long long val; 5155 unsigned long long val;
5161 int ret; 5156 int ret;
5162 5157
5163 type = MEMFILE_TYPE(cft->private); 5158 type = MEMFILE_TYPE(cft->private);
5164 name = MEMFILE_ATTR(cft->private); 5159 name = MEMFILE_ATTR(cft->private);
5165 5160
5166 switch (name) { 5161 switch (name) {
5167 case RES_LIMIT: 5162 case RES_LIMIT:
5168 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5163 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
5169 ret = -EINVAL; 5164 ret = -EINVAL;
5170 break; 5165 break;
5171 } 5166 }
5172 /* This function does all necessary parse...reuse it */ 5167 /* This function does all necessary parse...reuse it */
5173 ret = res_counter_memparse_write_strategy(buffer, &val); 5168 ret = res_counter_memparse_write_strategy(buffer, &val);
5174 if (ret) 5169 if (ret)
5175 break; 5170 break;
5176 if (type == _MEM) 5171 if (type == _MEM)
5177 ret = mem_cgroup_resize_limit(memcg, val); 5172 ret = mem_cgroup_resize_limit(memcg, val);
5178 else if (type == _MEMSWAP) 5173 else if (type == _MEMSWAP)
5179 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5174 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5180 else if (type == _KMEM) 5175 else if (type == _KMEM)
5181 ret = memcg_update_kmem_limit(memcg, val); 5176 ret = memcg_update_kmem_limit(memcg, val);
5182 else 5177 else
5183 return -EINVAL; 5178 return -EINVAL;
5184 break; 5179 break;
5185 case RES_SOFT_LIMIT: 5180 case RES_SOFT_LIMIT:
5186 ret = res_counter_memparse_write_strategy(buffer, &val); 5181 ret = res_counter_memparse_write_strategy(buffer, &val);
5187 if (ret) 5182 if (ret)
5188 break; 5183 break;
5189 /* 5184 /*
5190 * For memsw, soft limits are hard to implement in terms 5185 * For memsw, soft limits are hard to implement in terms
5191 * of semantics; for now, we support soft limits only for 5186 * of semantics; for now, we support soft limits only for
5192 * memory control without swap. 5187 * memory control without swap.
5193 */ 5188 */
5194 if (type == _MEM) 5189 if (type == _MEM)
5195 ret = res_counter_set_soft_limit(&memcg->res, val); 5190 ret = res_counter_set_soft_limit(&memcg->res, val);
5196 else 5191 else
5197 ret = -EINVAL; 5192 ret = -EINVAL;
5198 break; 5193 break;
5199 default: 5194 default:
5200 ret = -EINVAL; /* should be BUG() ? */ 5195 ret = -EINVAL; /* should be BUG() ? */
5201 break; 5196 break;
5202 } 5197 }
5203 return ret; 5198 return ret;
5204 } 5199 }
5205 5200
5206 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5201 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
5207 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5202 unsigned long long *mem_limit, unsigned long long *memsw_limit)
5208 { 5203 {
5209 unsigned long long min_limit, min_memsw_limit, tmp; 5204 unsigned long long min_limit, min_memsw_limit, tmp;
5210 5205
5211 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5206 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
5212 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5207 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5213 if (!memcg->use_hierarchy) 5208 if (!memcg->use_hierarchy)
5214 goto out; 5209 goto out;
5215 5210
5216 while (css_parent(&memcg->css)) { 5211 while (css_parent(&memcg->css)) {
5217 memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5212 memcg = mem_cgroup_from_css(css_parent(&memcg->css));
5218 if (!memcg->use_hierarchy) 5213 if (!memcg->use_hierarchy)
5219 break; 5214 break;
5220 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5215 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
5221 min_limit = min(min_limit, tmp); 5216 min_limit = min(min_limit, tmp);
5222 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5217 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
5223 min_memsw_limit = min(min_memsw_limit, tmp); 5218 min_memsw_limit = min(min_memsw_limit, tmp);
5224 } 5219 }
5225 out: 5220 out:
5226 *mem_limit = min_limit; 5221 *mem_limit = min_limit;
5227 *memsw_limit = min_memsw_limit; 5222 *memsw_limit = min_memsw_limit;
5228 } 5223 }
5229 5224
5230 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 5225 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)
5231 { 5226 {
5232 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5227 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5233 int name; 5228 int name;
5234 enum res_type type; 5229 enum res_type type;
5235 5230
5236 type = MEMFILE_TYPE(event); 5231 type = MEMFILE_TYPE(event);
5237 name = MEMFILE_ATTR(event); 5232 name = MEMFILE_ATTR(event);
5238 5233
5239 switch (name) { 5234 switch (name) {
5240 case RES_MAX_USAGE: 5235 case RES_MAX_USAGE:
5241 if (type == _MEM) 5236 if (type == _MEM)
5242 res_counter_reset_max(&memcg->res); 5237 res_counter_reset_max(&memcg->res);
5243 else if (type == _MEMSWAP) 5238 else if (type == _MEMSWAP)
5244 res_counter_reset_max(&memcg->memsw); 5239 res_counter_reset_max(&memcg->memsw);
5245 else if (type == _KMEM) 5240 else if (type == _KMEM)
5246 res_counter_reset_max(&memcg->kmem); 5241 res_counter_reset_max(&memcg->kmem);
5247 else 5242 else
5248 return -EINVAL; 5243 return -EINVAL;
5249 break; 5244 break;
5250 case RES_FAILCNT: 5245 case RES_FAILCNT:
5251 if (type == _MEM) 5246 if (type == _MEM)
5252 res_counter_reset_failcnt(&memcg->res); 5247 res_counter_reset_failcnt(&memcg->res);
5253 else if (type == _MEMSWAP) 5248 else if (type == _MEMSWAP)
5254 res_counter_reset_failcnt(&memcg->memsw); 5249 res_counter_reset_failcnt(&memcg->memsw);
5255 else if (type == _KMEM) 5250 else if (type == _KMEM)
5256 res_counter_reset_failcnt(&memcg->kmem); 5251 res_counter_reset_failcnt(&memcg->kmem);
5257 else 5252 else
5258 return -EINVAL; 5253 return -EINVAL;
5259 break; 5254 break;
5260 } 5255 }
5261 5256
5262 return 0; 5257 return 0;
5263 } 5258 }
5264 5259
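As a usage note (not part of the commit): mem_cgroup_reset() above is reached when userspace writes to the corresponding v1 reset files; the written value is ignored, and the event argument only carries the MEMFILE type/attr. A minimal sketch, assuming the usual /sys/fs/cgroup/memory mount and a cgroup named "demo":

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Best-effort reset; the kernel ignores the written value. */
static void reset_counter(const char *path)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, "0\n", 2) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	reset_counter("/sys/fs/cgroup/memory/demo/memory.max_usage_in_bytes");
	reset_counter("/sys/fs/cgroup/memory/demo/memory.failcnt");
	return 0;
}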
5265 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 5260 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
5266 struct cftype *cft) 5261 struct cftype *cft)
5267 { 5262 {
5268 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 5263 return mem_cgroup_from_css(css)->move_charge_at_immigrate;
5269 } 5264 }
5270 5265
5271 #ifdef CONFIG_MMU 5266 #ifdef CONFIG_MMU
5272 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5267 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5273 struct cftype *cft, u64 val) 5268 struct cftype *cft, u64 val)
5274 { 5269 {
5275 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5270 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5276 5271
5277 if (val >= (1 << NR_MOVE_TYPE)) 5272 if (val >= (1 << NR_MOVE_TYPE))
5278 return -EINVAL; 5273 return -EINVAL;
5279 5274
5280 /* 5275 /*
5281 * No kind of locking is needed in here, because ->can_attach() will 5276 * No kind of locking is needed in here, because ->can_attach() will
5282 * check this value once in the beginning of the process, and then carry 5277 * check this value once in the beginning of the process, and then carry
5283 * on with stale data. This means that changes to this value will only 5278 * on with stale data. This means that changes to this value will only
5284 * affect task migrations starting after the change. 5279 * affect task migrations starting after the change.
5285 */ 5280 */
5286 memcg->move_charge_at_immigrate = val; 5281 memcg->move_charge_at_immigrate = val;
5287 return 0; 5282 return 0;
5288 } 5283 }
5289 #else 5284 #else
5290 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5285 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5291 struct cftype *cft, u64 val) 5286 struct cftype *cft, u64 val)
5292 { 5287 {
5293 return -ENOSYS; 5288 return -ENOSYS;
5294 } 5289 }
5295 #endif 5290 #endif
5296 5291
5297 #ifdef CONFIG_NUMA 5292 #ifdef CONFIG_NUMA
5298 static int memcg_numa_stat_show(struct seq_file *m, void *v) 5293 static int memcg_numa_stat_show(struct seq_file *m, void *v)
5299 { 5294 {
5300 struct numa_stat { 5295 struct numa_stat {
5301 const char *name; 5296 const char *name;
5302 unsigned int lru_mask; 5297 unsigned int lru_mask;
5303 }; 5298 };
5304 5299
5305 static const struct numa_stat stats[] = { 5300 static const struct numa_stat stats[] = {
5306 { "total", LRU_ALL }, 5301 { "total", LRU_ALL },
5307 { "file", LRU_ALL_FILE }, 5302 { "file", LRU_ALL_FILE },
5308 { "anon", LRU_ALL_ANON }, 5303 { "anon", LRU_ALL_ANON },
5309 { "unevictable", BIT(LRU_UNEVICTABLE) }, 5304 { "unevictable", BIT(LRU_UNEVICTABLE) },
5310 }; 5305 };
5311 const struct numa_stat *stat; 5306 const struct numa_stat *stat;
5312 int nid; 5307 int nid;
5313 unsigned long nr; 5308 unsigned long nr;
5314 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5309 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5315 5310
5316 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5311 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5317 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5312 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
5318 seq_printf(m, "%s=%lu", stat->name, nr); 5313 seq_printf(m, "%s=%lu", stat->name, nr);
5319 for_each_node_state(nid, N_MEMORY) { 5314 for_each_node_state(nid, N_MEMORY) {
5320 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5315 nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
5321 stat->lru_mask); 5316 stat->lru_mask);
5322 seq_printf(m, " N%d=%lu", nid, nr); 5317 seq_printf(m, " N%d=%lu", nid, nr);
5323 } 5318 }
5324 seq_putc(m, '\n'); 5319 seq_putc(m, '\n');
5325 } 5320 }
5326 5321
5327 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5322 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5328 struct mem_cgroup *iter; 5323 struct mem_cgroup *iter;
5329 5324
5330 nr = 0; 5325 nr = 0;
5331 for_each_mem_cgroup_tree(iter, memcg) 5326 for_each_mem_cgroup_tree(iter, memcg)
5332 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 5327 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
5333 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 5328 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
5334 for_each_node_state(nid, N_MEMORY) { 5329 for_each_node_state(nid, N_MEMORY) {
5335 nr = 0; 5330 nr = 0;
5336 for_each_mem_cgroup_tree(iter, memcg) 5331 for_each_mem_cgroup_tree(iter, memcg)
5337 nr += mem_cgroup_node_nr_lru_pages( 5332 nr += mem_cgroup_node_nr_lru_pages(
5338 iter, nid, stat->lru_mask); 5333 iter, nid, stat->lru_mask);
5339 seq_printf(m, " N%d=%lu", nid, nr); 5334 seq_printf(m, " N%d=%lu", nid, nr);
5340 } 5335 }
5341 seq_putc(m, '\n'); 5336 seq_putc(m, '\n');
5342 } 5337 }
5343 5338
5344 return 0; 5339 return 0;
5345 } 5340 }
5346 #endif /* CONFIG_NUMA */ 5341 #endif /* CONFIG_NUMA */
5347 5342
5348 static inline void mem_cgroup_lru_names_not_uptodate(void) 5343 static inline void mem_cgroup_lru_names_not_uptodate(void)
5349 { 5344 {
5350 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5345 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5351 } 5346 }
5352 5347
5353 static int memcg_stat_show(struct seq_file *m, void *v) 5348 static int memcg_stat_show(struct seq_file *m, void *v)
5354 { 5349 {
5355 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); 5350 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5356 struct mem_cgroup *mi; 5351 struct mem_cgroup *mi;
5357 unsigned int i; 5352 unsigned int i;
5358 5353
5359 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5354 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5360 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5355 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5361 continue; 5356 continue;
5362 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5357 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
5363 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5358 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
5364 } 5359 }
5365 5360
5366 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5361 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
5367 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5362 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
5368 mem_cgroup_read_events(memcg, i)); 5363 mem_cgroup_read_events(memcg, i));
5369 5364
5370 for (i = 0; i < NR_LRU_LISTS; i++) 5365 for (i = 0; i < NR_LRU_LISTS; i++)
5371 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5366 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
5372 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5367 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
5373 5368
5374 /* Hierarchical information */ 5369 /* Hierarchical information */
5375 { 5370 {
5376 unsigned long long limit, memsw_limit; 5371 unsigned long long limit, memsw_limit;
5377 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5372 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
5378 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5373 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
5379 if (do_swap_account) 5374 if (do_swap_account)
5380 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5375 seq_printf(m, "hierarchical_memsw_limit %llu\n",
5381 memsw_limit); 5376 memsw_limit);
5382 } 5377 }
5383 5378
5384 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5379 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
5385 long long val = 0; 5380 long long val = 0;
5386 5381
5387 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5382 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
5388 continue; 5383 continue;
5389 for_each_mem_cgroup_tree(mi, memcg) 5384 for_each_mem_cgroup_tree(mi, memcg)
5390 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5385 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
5391 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5386 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
5392 } 5387 }
5393 5388
5394 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5389 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
5395 unsigned long long val = 0; 5390 unsigned long long val = 0;
5396 5391
5397 for_each_mem_cgroup_tree(mi, memcg) 5392 for_each_mem_cgroup_tree(mi, memcg)
5398 val += mem_cgroup_read_events(mi, i); 5393 val += mem_cgroup_read_events(mi, i);
5399 seq_printf(m, "total_%s %llu\n", 5394 seq_printf(m, "total_%s %llu\n",
5400 mem_cgroup_events_names[i], val); 5395 mem_cgroup_events_names[i], val);
5401 } 5396 }
5402 5397
5403 for (i = 0; i < NR_LRU_LISTS; i++) { 5398 for (i = 0; i < NR_LRU_LISTS; i++) {
5404 unsigned long long val = 0; 5399 unsigned long long val = 0;
5405 5400
5406 for_each_mem_cgroup_tree(mi, memcg) 5401 for_each_mem_cgroup_tree(mi, memcg)
5407 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5402 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
5408 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5403 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
5409 } 5404 }
5410 5405
5411 #ifdef CONFIG_DEBUG_VM 5406 #ifdef CONFIG_DEBUG_VM
5412 { 5407 {
5413 int nid, zid; 5408 int nid, zid;
5414 struct mem_cgroup_per_zone *mz; 5409 struct mem_cgroup_per_zone *mz;
5415 struct zone_reclaim_stat *rstat; 5410 struct zone_reclaim_stat *rstat;
5416 unsigned long recent_rotated[2] = {0, 0}; 5411 unsigned long recent_rotated[2] = {0, 0};
5417 unsigned long recent_scanned[2] = {0, 0}; 5412 unsigned long recent_scanned[2] = {0, 0};
5418 5413
5419 for_each_online_node(nid) 5414 for_each_online_node(nid)
5420 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5415 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
5421 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5416 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
5422 rstat = &mz->lruvec.reclaim_stat; 5417 rstat = &mz->lruvec.reclaim_stat;
5423 5418
5424 recent_rotated[0] += rstat->recent_rotated[0]; 5419 recent_rotated[0] += rstat->recent_rotated[0];
5425 recent_rotated[1] += rstat->recent_rotated[1]; 5420 recent_rotated[1] += rstat->recent_rotated[1];
5426 recent_scanned[0] += rstat->recent_scanned[0]; 5421 recent_scanned[0] += rstat->recent_scanned[0];
5427 recent_scanned[1] += rstat->recent_scanned[1]; 5422 recent_scanned[1] += rstat->recent_scanned[1];
5428 } 5423 }
5429 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5424 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
5430 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5425 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
5431 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5426 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
5432 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5427 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
5433 } 5428 }
5434 #endif 5429 #endif
5435 5430
5436 return 0; 5431 return 0;
5437 } 5432 }
5438 5433
5439 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 5434 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
5440 struct cftype *cft) 5435 struct cftype *cft)
5441 { 5436 {
5442 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5437 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5443 5438
5444 return mem_cgroup_swappiness(memcg); 5439 return mem_cgroup_swappiness(memcg);
5445 } 5440 }
5446 5441
5447 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 5442 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
5448 struct cftype *cft, u64 val) 5443 struct cftype *cft, u64 val)
5449 { 5444 {
5450 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5451 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5446 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5452 5447
5453 if (val > 100 || !parent) 5448 if (val > 100 || !parent)
5454 return -EINVAL; 5449 return -EINVAL;
5455 5450
5456 mutex_lock(&memcg_create_mutex); 5451 mutex_lock(&memcg_create_mutex);
5457 5452
5458 /* If under hierarchy, only empty-root can set this value */ 5453 /* If under hierarchy, only empty-root can set this value */
5459 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5454 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5460 mutex_unlock(&memcg_create_mutex); 5455 mutex_unlock(&memcg_create_mutex);
5461 return -EINVAL; 5456 return -EINVAL;
5462 } 5457 }
5463 5458
5464 memcg->swappiness = val; 5459 memcg->swappiness = val;
5465 5460
5466 mutex_unlock(&memcg_create_mutex); 5461 mutex_unlock(&memcg_create_mutex);
5467 5462
5468 return 0; 5463 return 0;
5469 } 5464 }
5470 5465
5471 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5466 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
5472 { 5467 {
5473 struct mem_cgroup_threshold_ary *t; 5468 struct mem_cgroup_threshold_ary *t;
5474 u64 usage; 5469 u64 usage;
5475 int i; 5470 int i;
5476 5471
5477 rcu_read_lock(); 5472 rcu_read_lock();
5478 if (!swap) 5473 if (!swap)
5479 t = rcu_dereference(memcg->thresholds.primary); 5474 t = rcu_dereference(memcg->thresholds.primary);
5480 else 5475 else
5481 t = rcu_dereference(memcg->memsw_thresholds.primary); 5476 t = rcu_dereference(memcg->memsw_thresholds.primary);
5482 5477
5483 if (!t) 5478 if (!t)
5484 goto unlock; 5479 goto unlock;
5485 5480
5486 usage = mem_cgroup_usage(memcg, swap); 5481 usage = mem_cgroup_usage(memcg, swap);
5487 5482
5488 /* 5483 /*
5489 * current_threshold points to the threshold just below or equal to usage. 5484 * current_threshold points to the threshold just below or equal to usage.
5490 * If that is no longer true, a threshold was crossed after the last 5485 * If that is no longer true, a threshold was crossed after the last
5491 * call of __mem_cgroup_threshold(). 5486 * call of __mem_cgroup_threshold().
5492 */ 5487 */
5493 i = t->current_threshold; 5488 i = t->current_threshold;
5494 5489
5495 /* 5490 /*
5496 * Iterate backward over array of thresholds starting from 5491 * Iterate backward over array of thresholds starting from
5497 * current_threshold and check if a threshold is crossed. 5492 * current_threshold and check if a threshold is crossed.
5498 * If none of the thresholds below usage is crossed, we read 5493 * If none of the thresholds below usage is crossed, we read
5499 * only one element of the array here. 5494 * only one element of the array here.
5500 */ 5495 */
5501 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5496 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
5502 eventfd_signal(t->entries[i].eventfd, 1); 5497 eventfd_signal(t->entries[i].eventfd, 1);
5503 5498
5504 /* i = current_threshold + 1 */ 5499 /* i = current_threshold + 1 */
5505 i++; 5500 i++;
5506 5501
5507 /* 5502 /*
5508 * Iterate forward over array of thresholds starting from 5503 * Iterate forward over array of thresholds starting from
5509 * current_threshold+1 and check if a threshold is crossed. 5504 * current_threshold+1 and check if a threshold is crossed.
5510 * If none of the thresholds above usage is crossed, we read 5505 * If none of the thresholds above usage is crossed, we read
5511 * only one element of the array here. 5506 * only one element of the array here.
5512 */ 5507 */
5513 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5508 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
5514 eventfd_signal(t->entries[i].eventfd, 1); 5509 eventfd_signal(t->entries[i].eventfd, 1);
5515 5510
5516 /* Update current_threshold */ 5511 /* Update current_threshold */
5517 t->current_threshold = i - 1; 5512 t->current_threshold = i - 1;
5518 unlock: 5513 unlock:
5519 rcu_read_unlock(); 5514 rcu_read_unlock();
5520 } 5515 }
5521 5516
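For readers skimming the diff, the scan in __mem_cgroup_threshold() above is easier to see in isolation. Below is a small, self-contained C sketch (illustrative only, with made-up names) of the same invariant: entries[] is sorted by threshold, cur indexes the largest threshold at or below the previously seen usage, and a crossing in either direction signals every threshold passed over.

struct thr {
	unsigned long long threshold;
	int fired;			/* stands in for eventfd_signal() */
};

/* Illustrative restatement of the scan; not kernel code. */
static void scan_thresholds(struct thr *entries, int size, int *cur,
			    unsigned long long usage)
{
	int i = *cur;

	/* usage dropped: walk left over thresholds now above usage */
	for (; i >= 0 && entries[i].threshold > usage; i--)
		entries[i].fired++;

	/* usage grew: walk right over thresholds now at or below usage */
	for (i++; i < size && entries[i].threshold <= usage; i++)
		entries[i].fired++;

	/* i ends up one past the last threshold <= usage */
	*cur = i - 1;
}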
5522 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5517 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
5523 { 5518 {
5524 while (memcg) { 5519 while (memcg) {
5525 __mem_cgroup_threshold(memcg, false); 5520 __mem_cgroup_threshold(memcg, false);
5526 if (do_swap_account) 5521 if (do_swap_account)
5527 __mem_cgroup_threshold(memcg, true); 5522 __mem_cgroup_threshold(memcg, true);
5528 5523
5529 memcg = parent_mem_cgroup(memcg); 5524 memcg = parent_mem_cgroup(memcg);
5530 } 5525 }
5531 } 5526 }
5532 5527
5533 static int compare_thresholds(const void *a, const void *b) 5528 static int compare_thresholds(const void *a, const void *b)
5534 { 5529 {
5535 const struct mem_cgroup_threshold *_a = a; 5530 const struct mem_cgroup_threshold *_a = a;
5536 const struct mem_cgroup_threshold *_b = b; 5531 const struct mem_cgroup_threshold *_b = b;
5537 5532
5538 if (_a->threshold > _b->threshold) 5533 if (_a->threshold > _b->threshold)
5539 return 1; 5534 return 1;
5540 5535
5541 if (_a->threshold < _b->threshold) 5536 if (_a->threshold < _b->threshold)
5542 return -1; 5537 return -1;
5543 5538
5544 return 0; 5539 return 0;
5545 } 5540 }
5546 5541
5547 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5542 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
5548 { 5543 {
5549 struct mem_cgroup_eventfd_list *ev; 5544 struct mem_cgroup_eventfd_list *ev;
5550 5545
5551 list_for_each_entry(ev, &memcg->oom_notify, list) 5546 list_for_each_entry(ev, &memcg->oom_notify, list)
5552 eventfd_signal(ev->eventfd, 1); 5547 eventfd_signal(ev->eventfd, 1);
5553 return 0; 5548 return 0;
5554 } 5549 }
5555 5550
5556 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5551 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5557 { 5552 {
5558 struct mem_cgroup *iter; 5553 struct mem_cgroup *iter;
5559 5554
5560 for_each_mem_cgroup_tree(iter, memcg) 5555 for_each_mem_cgroup_tree(iter, memcg)
5561 mem_cgroup_oom_notify_cb(iter); 5556 mem_cgroup_oom_notify_cb(iter);
5562 } 5557 }
5563 5558
5564 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5559 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5565 struct eventfd_ctx *eventfd, const char *args, enum res_type type) 5560 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5566 { 5561 {
5567 struct mem_cgroup_thresholds *thresholds; 5562 struct mem_cgroup_thresholds *thresholds;
5568 struct mem_cgroup_threshold_ary *new; 5563 struct mem_cgroup_threshold_ary *new;
5569 u64 threshold, usage; 5564 u64 threshold, usage;
5570 int i, size, ret; 5565 int i, size, ret;
5571 5566
5572 ret = res_counter_memparse_write_strategy(args, &threshold); 5567 ret = res_counter_memparse_write_strategy(args, &threshold);
5573 if (ret) 5568 if (ret)
5574 return ret; 5569 return ret;
5575 5570
5576 mutex_lock(&memcg->thresholds_lock); 5571 mutex_lock(&memcg->thresholds_lock);
5577 5572
5578 if (type == _MEM) 5573 if (type == _MEM)
5579 thresholds = &memcg->thresholds; 5574 thresholds = &memcg->thresholds;
5580 else if (type == _MEMSWAP) 5575 else if (type == _MEMSWAP)
5581 thresholds = &memcg->memsw_thresholds; 5576 thresholds = &memcg->memsw_thresholds;
5582 else 5577 else
5583 BUG(); 5578 BUG();
5584 5579
5585 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5580 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5586 5581
5587 /* Check if a threshold crossed before adding a new one */ 5582 /* Check if a threshold crossed before adding a new one */
5588 if (thresholds->primary) 5583 if (thresholds->primary)
5589 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5584 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5590 5585
5591 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5586 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
5592 5587
5593 /* Allocate memory for new array of thresholds */ 5588 /* Allocate memory for new array of thresholds */
5594 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5589 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
5595 GFP_KERNEL); 5590 GFP_KERNEL);
5596 if (!new) { 5591 if (!new) {
5597 ret = -ENOMEM; 5592 ret = -ENOMEM;
5598 goto unlock; 5593 goto unlock;
5599 } 5594 }
5600 new->size = size; 5595 new->size = size;
5601 5596
5602 /* Copy thresholds (if any) to new array */ 5597 /* Copy thresholds (if any) to new array */
5603 if (thresholds->primary) { 5598 if (thresholds->primary) {
5604 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5599 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
5605 sizeof(struct mem_cgroup_threshold)); 5600 sizeof(struct mem_cgroup_threshold));
5606 } 5601 }
5607 5602
5608 /* Add new threshold */ 5603 /* Add new threshold */
5609 new->entries[size - 1].eventfd = eventfd; 5604 new->entries[size - 1].eventfd = eventfd;
5610 new->entries[size - 1].threshold = threshold; 5605 new->entries[size - 1].threshold = threshold;
5611 5606
5612 /* Sort thresholds. Registering of new threshold isn't time-critical */ 5607 /* Sort thresholds. Registering of new threshold isn't time-critical */
5613 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5608 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
5614 compare_thresholds, NULL); 5609 compare_thresholds, NULL);
5615 5610
5616 /* Find current threshold */ 5611 /* Find current threshold */
5617 new->current_threshold = -1; 5612 new->current_threshold = -1;
5618 for (i = 0; i < size; i++) { 5613 for (i = 0; i < size; i++) {
5619 if (new->entries[i].threshold <= usage) { 5614 if (new->entries[i].threshold <= usage) {
5620 /* 5615 /*
5621 * new->current_threshold will not be used until 5616 * new->current_threshold will not be used until
5622 * rcu_assign_pointer(), so it's safe to increment 5617 * rcu_assign_pointer(), so it's safe to increment
5623 * it here. 5618 * it here.
5624 */ 5619 */
5625 ++new->current_threshold; 5620 ++new->current_threshold;
5626 } else 5621 } else
5627 break; 5622 break;
5628 } 5623 }
5629 5624
5630 /* Free old spare buffer and save old primary buffer as spare */ 5625 /* Free old spare buffer and save old primary buffer as spare */
5631 kfree(thresholds->spare); 5626 kfree(thresholds->spare);
5632 thresholds->spare = thresholds->primary; 5627 thresholds->spare = thresholds->primary;
5633 5628
5634 rcu_assign_pointer(thresholds->primary, new); 5629 rcu_assign_pointer(thresholds->primary, new);
5635 5630
5636 /* To be sure that nobody uses thresholds */ 5631 /* To be sure that nobody uses thresholds */
5637 synchronize_rcu(); 5632 synchronize_rcu();
5638 5633
5639 unlock: 5634 unlock:
5640 mutex_unlock(&memcg->thresholds_lock); 5635 mutex_unlock(&memcg->thresholds_lock);
5641 5636
5642 return ret; 5637 return ret;
5643 } 5638 }
5644 5639
5645 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg, 5640 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5646 struct eventfd_ctx *eventfd, const char *args) 5641 struct eventfd_ctx *eventfd, const char *args)
5647 { 5642 {
5648 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM); 5643 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5649 } 5644 }
5650 5645
5651 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg, 5646 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5652 struct eventfd_ctx *eventfd, const char *args) 5647 struct eventfd_ctx *eventfd, const char *args)
5653 { 5648 {
5654 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP); 5649 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5655 } 5650 }
5656 5651
5657 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5652 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5658 struct eventfd_ctx *eventfd, enum res_type type) 5653 struct eventfd_ctx *eventfd, enum res_type type)
5659 { 5654 {
5660 struct mem_cgroup_thresholds *thresholds; 5655 struct mem_cgroup_thresholds *thresholds;
5661 struct mem_cgroup_threshold_ary *new; 5656 struct mem_cgroup_threshold_ary *new;
5662 u64 usage; 5657 u64 usage;
5663 int i, j, size; 5658 int i, j, size;
5664 5659
5665 mutex_lock(&memcg->thresholds_lock); 5660 mutex_lock(&memcg->thresholds_lock);
5666 if (type == _MEM) 5661 if (type == _MEM)
5667 thresholds = &memcg->thresholds; 5662 thresholds = &memcg->thresholds;
5668 else if (type == _MEMSWAP) 5663 else if (type == _MEMSWAP)
5669 thresholds = &memcg->memsw_thresholds; 5664 thresholds = &memcg->memsw_thresholds;
5670 else 5665 else
5671 BUG(); 5666 BUG();
5672 5667
5673 if (!thresholds->primary) 5668 if (!thresholds->primary)
5674 goto unlock; 5669 goto unlock;
5675 5670
5676 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5671 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
5677 5672
5678 /* Check if a threshold crossed before removing */ 5673 /* Check if a threshold crossed before removing */
5679 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5674 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
5680 5675
5681 /* Calculate the new number of thresholds */ 5676 /* Calculate the new number of thresholds */
5682 size = 0; 5677 size = 0;
5683 for (i = 0; i < thresholds->primary->size; i++) { 5678 for (i = 0; i < thresholds->primary->size; i++) {
5684 if (thresholds->primary->entries[i].eventfd != eventfd) 5679 if (thresholds->primary->entries[i].eventfd != eventfd)
5685 size++; 5680 size++;
5686 } 5681 }
5687 5682
5688 new = thresholds->spare; 5683 new = thresholds->spare;
5689 5684
5690 /* Set thresholds array to NULL if we don't have thresholds */ 5685 /* Set thresholds array to NULL if we don't have thresholds */
5691 if (!size) { 5686 if (!size) {
5692 kfree(new); 5687 kfree(new);
5693 new = NULL; 5688 new = NULL;
5694 goto swap_buffers; 5689 goto swap_buffers;
5695 } 5690 }
5696 5691
5697 new->size = size; 5692 new->size = size;
5698 5693
5699 /* Copy thresholds and find current threshold */ 5694 /* Copy thresholds and find current threshold */
5700 new->current_threshold = -1; 5695 new->current_threshold = -1;
5701 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5696 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
5702 if (thresholds->primary->entries[i].eventfd == eventfd) 5697 if (thresholds->primary->entries[i].eventfd == eventfd)
5703 continue; 5698 continue;
5704 5699
5705 new->entries[j] = thresholds->primary->entries[i]; 5700 new->entries[j] = thresholds->primary->entries[i];
5706 if (new->entries[j].threshold <= usage) { 5701 if (new->entries[j].threshold <= usage) {
5707 /* 5702 /*
5708 * new->current_threshold will not be used 5703 * new->current_threshold will not be used
5709 * until rcu_assign_pointer(), so it's safe to increment 5704 * until rcu_assign_pointer(), so it's safe to increment
5710 * it here. 5705 * it here.
5711 */ 5706 */
5712 ++new->current_threshold; 5707 ++new->current_threshold;
5713 } 5708 }
5714 j++; 5709 j++;
5715 } 5710 }
5716 5711
5717 swap_buffers: 5712 swap_buffers:
5718 /* Swap primary and spare array */ 5713 /* Swap primary and spare array */
5719 thresholds->spare = thresholds->primary; 5714 thresholds->spare = thresholds->primary;
5720 /* If all events are unregistered, free the spare array */ 5715 /* If all events are unregistered, free the spare array */
5721 if (!new) { 5716 if (!new) {
5722 kfree(thresholds->spare); 5717 kfree(thresholds->spare);
5723 thresholds->spare = NULL; 5718 thresholds->spare = NULL;
5724 } 5719 }
5725 5720
5726 rcu_assign_pointer(thresholds->primary, new); 5721 rcu_assign_pointer(thresholds->primary, new);
5727 5722
5728 /* To be sure that nobody uses thresholds */ 5723 /* To be sure that nobody uses thresholds */
5729 synchronize_rcu(); 5724 synchronize_rcu();
5730 unlock: 5725 unlock:
5731 mutex_unlock(&memcg->thresholds_lock); 5726 mutex_unlock(&memcg->thresholds_lock);
5732 } 5727 }
5733 5728
5734 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5729 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5735 struct eventfd_ctx *eventfd) 5730 struct eventfd_ctx *eventfd)
5736 { 5731 {
5737 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM); 5732 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5738 } 5733 }
5739 5734
5740 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg, 5735 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5741 struct eventfd_ctx *eventfd) 5736 struct eventfd_ctx *eventfd)
5742 { 5737 {
5743 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP); 5738 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5744 } 5739 }
5745 5740
5746 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg, 5741 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5747 struct eventfd_ctx *eventfd, const char *args) 5742 struct eventfd_ctx *eventfd, const char *args)
5748 { 5743 {
5749 struct mem_cgroup_eventfd_list *event; 5744 struct mem_cgroup_eventfd_list *event;
5750 5745
5751 event = kmalloc(sizeof(*event), GFP_KERNEL); 5746 event = kmalloc(sizeof(*event), GFP_KERNEL);
5752 if (!event) 5747 if (!event)
5753 return -ENOMEM; 5748 return -ENOMEM;
5754 5749
5755 spin_lock(&memcg_oom_lock); 5750 spin_lock(&memcg_oom_lock);
5756 5751
5757 event->eventfd = eventfd; 5752 event->eventfd = eventfd;
5758 list_add(&event->list, &memcg->oom_notify); 5753 list_add(&event->list, &memcg->oom_notify);
5759 5754
5760 /* already in OOM ? */ 5755 /* already in OOM ? */
5761 if (atomic_read(&memcg->under_oom)) 5756 if (atomic_read(&memcg->under_oom))
5762 eventfd_signal(eventfd, 1); 5757 eventfd_signal(eventfd, 1);
5763 spin_unlock(&memcg_oom_lock); 5758 spin_unlock(&memcg_oom_lock);
5764 5759
5765 return 0; 5760 return 0;
5766 } 5761 }
5767 5762
5768 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg, 5763 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5769 struct eventfd_ctx *eventfd) 5764 struct eventfd_ctx *eventfd)
5770 { 5765 {
5771 struct mem_cgroup_eventfd_list *ev, *tmp; 5766 struct mem_cgroup_eventfd_list *ev, *tmp;
5772 5767
5773 spin_lock(&memcg_oom_lock); 5768 spin_lock(&memcg_oom_lock);
5774 5769
5775 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { 5770 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5776 if (ev->eventfd == eventfd) { 5771 if (ev->eventfd == eventfd) {
5777 list_del(&ev->list); 5772 list_del(&ev->list);
5778 kfree(ev); 5773 kfree(ev);
5779 } 5774 }
5780 } 5775 }
5781 5776
5782 spin_unlock(&memcg_oom_lock); 5777 spin_unlock(&memcg_oom_lock);
5783 } 5778 }
5784 5779
5785 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v) 5780 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5786 { 5781 {
5787 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 5782 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5788 5783
5789 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable); 5784 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5790 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom)); 5785 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5791 return 0; 5786 return 0;
5792 } 5787 }
5793 5788
5794 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, 5789 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5795 struct cftype *cft, u64 val) 5790 struct cftype *cft, u64 val)
5796 { 5791 {
5797 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5792 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5798 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5793 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5799 5794
5800 /* cannot set to root cgroup and only 0 and 1 are allowed */ 5795 /* cannot set to root cgroup and only 0 and 1 are allowed */
5801 if (!parent || !((val == 0) || (val == 1))) 5796 if (!parent || !((val == 0) || (val == 1)))
5802 return -EINVAL; 5797 return -EINVAL;
5803 5798
5804 mutex_lock(&memcg_create_mutex); 5799 mutex_lock(&memcg_create_mutex);
5805 /* oom-kill-disable is a flag for subhierarchy. */ 5800 /* oom-kill-disable is a flag for subhierarchy. */
5806 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5801 if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5807 mutex_unlock(&memcg_create_mutex); 5802 mutex_unlock(&memcg_create_mutex);
5808 return -EINVAL; 5803 return -EINVAL;
5809 } 5804 }
5810 memcg->oom_kill_disable = val; 5805 memcg->oom_kill_disable = val;
5811 if (!val) 5806 if (!val)
5812 memcg_oom_recover(memcg); 5807 memcg_oom_recover(memcg);
5813 mutex_unlock(&memcg_create_mutex); 5808 mutex_unlock(&memcg_create_mutex);
5814 return 0; 5809 return 0;
5815 } 5810 }
5816 5811
5817 #ifdef CONFIG_MEMCG_KMEM 5812 #ifdef CONFIG_MEMCG_KMEM
5818 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5813 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5819 { 5814 {
5820 int ret; 5815 int ret;
5821 5816
5822 memcg->kmemcg_id = -1; 5817 memcg->kmemcg_id = -1;
5823 ret = memcg_propagate_kmem(memcg); 5818 ret = memcg_propagate_kmem(memcg);
5824 if (ret) 5819 if (ret)
5825 return ret; 5820 return ret;
5826 5821
5827 return mem_cgroup_sockets_init(memcg, ss); 5822 return mem_cgroup_sockets_init(memcg, ss);
5828 } 5823 }
5829 5824
5830 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5825 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5831 { 5826 {
5832 mem_cgroup_sockets_destroy(memcg); 5827 mem_cgroup_sockets_destroy(memcg);
5833 } 5828 }
5834 5829
5835 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5830 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5836 { 5831 {
5837 if (!memcg_kmem_is_active(memcg)) 5832 if (!memcg_kmem_is_active(memcg))
5838 return; 5833 return;
5839 5834
5840 /* 5835 /*
5841 * kmem charges can outlive the cgroup. In the case of slab 5836 * kmem charges can outlive the cgroup. In the case of slab
5842 * pages, for instance, a page may contain objects from various 5837 * pages, for instance, a page may contain objects from various
5843 * processes. As we do not take a reference for every such 5838 * processes. As we do not take a reference for every such
5844 * allocation, we have to be careful when doing uncharge 5839 * allocation, we have to be careful when doing uncharge
5845 * (see memcg_uncharge_kmem) and here during offlining. 5840 * (see memcg_uncharge_kmem) and here during offlining.
5846 * 5841 *
5847 * The idea is that only the _last_ uncharge which sees 5842 * The idea is that only the _last_ uncharge which sees
5848 * the dead memcg will drop the last reference. An additional 5843 * the dead memcg will drop the last reference. An additional
5849 * reference is taken here before the group is marked dead 5844 * reference is taken here before the group is marked dead
5850 * which is then paired with css_put during uncharge resp. here. 5845 * which is then paired with css_put during uncharge resp. here.
5851 * 5846 *
5852 * Although this might sound strange as this path is called from 5847 * Although this might sound strange as this path is called from
5853 * css_offline() when the reference might have dropped down to 0 5848 * css_offline() when the reference might have dropped down to 0
5854 * and shouldn't be incremented anymore (css_tryget would fail) 5849 * and shouldn't be incremented anymore (css_tryget would fail)
5855 * we do not have other options because of the kmem allocations 5850 * we do not have other options because of the kmem allocations
5856 * lifetime. 5851 * lifetime.
5857 */ 5852 */
5858 css_get(&memcg->css); 5853 css_get(&memcg->css);
5859 5854
5860 memcg_kmem_mark_dead(memcg); 5855 memcg_kmem_mark_dead(memcg);
5861 5856
5862 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5857 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
5863 return; 5858 return;
5864 5859
5865 if (memcg_kmem_test_and_clear_dead(memcg)) 5860 if (memcg_kmem_test_and_clear_dead(memcg))
5866 css_put(&memcg->css); 5861 css_put(&memcg->css);
5867 } 5862 }
5868 #else 5863 #else
5869 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5864 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5870 { 5865 {
5871 return 0; 5866 return 0;
5872 } 5867 }
5873 5868
5874 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5869 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5875 { 5870 {
5876 } 5871 }
5877 5872
5878 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5873 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5879 { 5874 {
5880 } 5875 }
5881 #endif 5876 #endif
5882 5877
5883 /* 5878 /*
5884 * DO NOT USE IN NEW FILES. 5879 * DO NOT USE IN NEW FILES.
5885 * 5880 *
5886 * "cgroup.event_control" implementation. 5881 * "cgroup.event_control" implementation.
5887 * 5882 *
5888 * This is way over-engineered. It tries to support fully configurable 5883 * This is way over-engineered. It tries to support fully configurable
5889 * events for each user. Such level of flexibility is completely 5884 * events for each user. Such level of flexibility is completely
5890 * unnecessary especially in the light of the planned unified hierarchy. 5885 * unnecessary especially in the light of the planned unified hierarchy.
5891 * 5886 *
5892 * Please deprecate this and replace with something simpler if at all 5887 * Please deprecate this and replace with something simpler if at all
5893 * possible. 5888 * possible.
5894 */ 5889 */
5895 5890
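To make the legacy interface concrete, here is a hedged userspace sketch of the '<event_fd> <control_fd> <args>' protocol that memcg_write_event_control() below parses. The mount point, the cgroup name "demo" and the 64M threshold are assumptions for the example; since the control file is memory.usage_in_bytes, the callback registered is mem_cgroup_usage_register_event().

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
		       O_RDONLY);
	int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
			O_WRONLY);
	char buf[64];
	uint64_t ticks;

	if (efd < 0 || cfd < 0 || ecfd < 0)
		return 1;

	/* "<event_fd> <control_fd> <args>"; args is a usage threshold here */
	snprintf(buf, sizeof(buf), "%d %d 67108864", efd, cfd);
	if (write(ecfd, buf, strlen(buf)) < 0)
		return 1;

	/* blocks until the 64M usage threshold is crossed */
	if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("usage threshold crossed\n");
	return 0;
}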
5896 /* 5891 /*
5897 * Unregister event and free resources. 5892 * Unregister event and free resources.
5898 * 5893 *
5899 * Gets called from workqueue. 5894 * Gets called from workqueue.
5900 */ 5895 */
5901 static void memcg_event_remove(struct work_struct *work) 5896 static void memcg_event_remove(struct work_struct *work)
5902 { 5897 {
5903 struct mem_cgroup_event *event = 5898 struct mem_cgroup_event *event =
5904 container_of(work, struct mem_cgroup_event, remove); 5899 container_of(work, struct mem_cgroup_event, remove);
5905 struct mem_cgroup *memcg = event->memcg; 5900 struct mem_cgroup *memcg = event->memcg;
5906 5901
5907 remove_wait_queue(event->wqh, &event->wait); 5902 remove_wait_queue(event->wqh, &event->wait);
5908 5903
5909 event->unregister_event(memcg, event->eventfd); 5904 event->unregister_event(memcg, event->eventfd);
5910 5905
5911 /* Notify userspace the event is going away. */ 5906 /* Notify userspace the event is going away. */
5912 eventfd_signal(event->eventfd, 1); 5907 eventfd_signal(event->eventfd, 1);
5913 5908
5914 eventfd_ctx_put(event->eventfd); 5909 eventfd_ctx_put(event->eventfd);
5915 kfree(event); 5910 kfree(event);
5916 css_put(&memcg->css); 5911 css_put(&memcg->css);
5917 } 5912 }
5918 5913
5919 /* 5914 /*
5920 * Gets called on POLLHUP on eventfd when user closes it. 5915 * Gets called on POLLHUP on eventfd when user closes it.
5921 * 5916 *
5922 * Called with wqh->lock held and interrupts disabled. 5917 * Called with wqh->lock held and interrupts disabled.
5923 */ 5918 */
5924 static int memcg_event_wake(wait_queue_t *wait, unsigned mode, 5919 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
5925 int sync, void *key) 5920 int sync, void *key)
5926 { 5921 {
5927 struct mem_cgroup_event *event = 5922 struct mem_cgroup_event *event =
5928 container_of(wait, struct mem_cgroup_event, wait); 5923 container_of(wait, struct mem_cgroup_event, wait);
5929 struct mem_cgroup *memcg = event->memcg; 5924 struct mem_cgroup *memcg = event->memcg;
5930 unsigned long flags = (unsigned long)key; 5925 unsigned long flags = (unsigned long)key;
5931 5926
5932 if (flags & POLLHUP) { 5927 if (flags & POLLHUP) {
5933 /* 5928 /*
5934 * If the event has been detached at cgroup removal, we 5929 * If the event has been detached at cgroup removal, we
5935 * can simply return knowing the other side will clean up 5930 * can simply return knowing the other side will clean up
5936 * for us. 5931 * for us.
5937 * 5932 *
5938 * We can't race against event freeing since the other 5933 * We can't race against event freeing since the other
5939 * side will require wqh->lock via remove_wait_queue(), 5934 * side will require wqh->lock via remove_wait_queue(),
5940 * which we hold. 5935 * which we hold.
5941 */ 5936 */
5942 spin_lock(&memcg->event_list_lock); 5937 spin_lock(&memcg->event_list_lock);
5943 if (!list_empty(&event->list)) { 5938 if (!list_empty(&event->list)) {
5944 list_del_init(&event->list); 5939 list_del_init(&event->list);
5945 /* 5940 /*
5946 * We are in atomic context, but memcg_event_remove() 5941 * We are in atomic context, but memcg_event_remove()
5947 * may sleep, so we have to call it from a workqueue. 5942 * may sleep, so we have to call it from a workqueue.
5948 */ 5943 */
5949 schedule_work(&event->remove); 5944 schedule_work(&event->remove);
5950 } 5945 }
5951 spin_unlock(&memcg->event_list_lock); 5946 spin_unlock(&memcg->event_list_lock);
5952 } 5947 }
5953 5948
5954 return 0; 5949 return 0;
5955 } 5950 }
5956 5951
5957 static void memcg_event_ptable_queue_proc(struct file *file, 5952 static void memcg_event_ptable_queue_proc(struct file *file,
5958 wait_queue_head_t *wqh, poll_table *pt) 5953 wait_queue_head_t *wqh, poll_table *pt)
5959 { 5954 {
5960 struct mem_cgroup_event *event = 5955 struct mem_cgroup_event *event =
5961 container_of(pt, struct mem_cgroup_event, pt); 5956 container_of(pt, struct mem_cgroup_event, pt);
5962 5957
5963 event->wqh = wqh; 5958 event->wqh = wqh;
5964 add_wait_queue(wqh, &event->wait); 5959 add_wait_queue(wqh, &event->wait);
5965 } 5960 }
5966 5961
5967 /* 5962 /*
5968 * DO NOT USE IN NEW FILES. 5963 * DO NOT USE IN NEW FILES.
5969 * 5964 *
5970 * Parse input and register new cgroup event handler. 5965 * Parse input and register new cgroup event handler.
5971 * 5966 *
5972 * Input must be in format '<event_fd> <control_fd> <args>'. 5967 * Input must be in format '<event_fd> <control_fd> <args>'.
5973 * Interpretation of args is defined by control file implementation. 5968 * Interpretation of args is defined by control file implementation.
5974 */ 5969 */
5975 static int memcg_write_event_control(struct cgroup_subsys_state *css, 5970 static int memcg_write_event_control(struct cgroup_subsys_state *css,
5976 struct cftype *cft, char *buffer) 5971 struct cftype *cft, char *buffer)
5977 { 5972 {
5978 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5973 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5979 struct mem_cgroup_event *event; 5974 struct mem_cgroup_event *event;
5980 struct cgroup_subsys_state *cfile_css; 5975 struct cgroup_subsys_state *cfile_css;
5981 unsigned int efd, cfd; 5976 unsigned int efd, cfd;
5982 struct fd efile; 5977 struct fd efile;
5983 struct fd cfile; 5978 struct fd cfile;
5984 const char *name; 5979 const char *name;
5985 char *endp; 5980 char *endp;
5986 int ret; 5981 int ret;
5987 5982
5988 efd = simple_strtoul(buffer, &endp, 10); 5983 efd = simple_strtoul(buffer, &endp, 10);
5989 if (*endp != ' ') 5984 if (*endp != ' ')
5990 return -EINVAL; 5985 return -EINVAL;
5991 buffer = endp + 1; 5986 buffer = endp + 1;
5992 5987
5993 cfd = simple_strtoul(buffer, &endp, 10); 5988 cfd = simple_strtoul(buffer, &endp, 10);
5994 if ((*endp != ' ') && (*endp != '\0')) 5989 if ((*endp != ' ') && (*endp != '\0'))
5995 return -EINVAL; 5990 return -EINVAL;
5996 buffer = endp + 1; 5991 buffer = endp + 1;
5997 5992
5998 event = kzalloc(sizeof(*event), GFP_KERNEL); 5993 event = kzalloc(sizeof(*event), GFP_KERNEL);
5999 if (!event) 5994 if (!event)
6000 return -ENOMEM; 5995 return -ENOMEM;
6001 5996
6002 event->memcg = memcg; 5997 event->memcg = memcg;
6003 INIT_LIST_HEAD(&event->list); 5998 INIT_LIST_HEAD(&event->list);
6004 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc); 5999 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6005 init_waitqueue_func_entry(&event->wait, memcg_event_wake); 6000 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6006 INIT_WORK(&event->remove, memcg_event_remove); 6001 INIT_WORK(&event->remove, memcg_event_remove);
6007 6002
6008 efile = fdget(efd); 6003 efile = fdget(efd);
6009 if (!efile.file) { 6004 if (!efile.file) {
6010 ret = -EBADF; 6005 ret = -EBADF;
6011 goto out_kfree; 6006 goto out_kfree;
6012 } 6007 }
6013 6008
6014 event->eventfd = eventfd_ctx_fileget(efile.file); 6009 event->eventfd = eventfd_ctx_fileget(efile.file);
6015 if (IS_ERR(event->eventfd)) { 6010 if (IS_ERR(event->eventfd)) {
6016 ret = PTR_ERR(event->eventfd); 6011 ret = PTR_ERR(event->eventfd);
6017 goto out_put_efile; 6012 goto out_put_efile;
6018 } 6013 }
6019 6014
6020 cfile = fdget(cfd); 6015 cfile = fdget(cfd);
6021 if (!cfile.file) { 6016 if (!cfile.file) {
6022 ret = -EBADF; 6017 ret = -EBADF;
6023 goto out_put_eventfd; 6018 goto out_put_eventfd;
6024 } 6019 }
6025 6020
6026 /* the process needs read permission on the control file */ 6021 /* the process needs read permission on the control file */
6027 /* AV: shouldn't we check that it's been opened for read instead? */ 6022 /* AV: shouldn't we check that it's been opened for read instead? */
6028 ret = inode_permission(file_inode(cfile.file), MAY_READ); 6023 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6029 if (ret < 0) 6024 if (ret < 0)
6030 goto out_put_cfile; 6025 goto out_put_cfile;
6031 6026
6032 /* 6027 /*
6033 * Determine the event callbacks and set them in @event. This used 6028 * Determine the event callbacks and set them in @event. This used
6034 * to be done via struct cftype but cgroup core no longer knows 6029 * to be done via struct cftype but cgroup core no longer knows
6035 * about these events. The following is crude but the whole thing 6030 * about these events. The following is crude but the whole thing
6036 * is for compatibility anyway. 6031 * is for compatibility anyway.
6037 * 6032 *
6038 * DO NOT ADD NEW FILES. 6033 * DO NOT ADD NEW FILES.
6039 */ 6034 */
6040 name = cfile.file->f_dentry->d_name.name; 6035 name = cfile.file->f_dentry->d_name.name;
6041 6036
6042 if (!strcmp(name, "memory.usage_in_bytes")) { 6037 if (!strcmp(name, "memory.usage_in_bytes")) {
6043 event->register_event = mem_cgroup_usage_register_event; 6038 event->register_event = mem_cgroup_usage_register_event;
6044 event->unregister_event = mem_cgroup_usage_unregister_event; 6039 event->unregister_event = mem_cgroup_usage_unregister_event;
6045 } else if (!strcmp(name, "memory.oom_control")) { 6040 } else if (!strcmp(name, "memory.oom_control")) {
6046 event->register_event = mem_cgroup_oom_register_event; 6041 event->register_event = mem_cgroup_oom_register_event;
6047 event->unregister_event = mem_cgroup_oom_unregister_event; 6042 event->unregister_event = mem_cgroup_oom_unregister_event;
6048 } else if (!strcmp(name, "memory.pressure_level")) { 6043 } else if (!strcmp(name, "memory.pressure_level")) {
6049 event->register_event = vmpressure_register_event; 6044 event->register_event = vmpressure_register_event;
6050 event->unregister_event = vmpressure_unregister_event; 6045 event->unregister_event = vmpressure_unregister_event;
6051 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) { 6046 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6052 event->register_event = memsw_cgroup_usage_register_event; 6047 event->register_event = memsw_cgroup_usage_register_event;
6053 event->unregister_event = memsw_cgroup_usage_unregister_event; 6048 event->unregister_event = memsw_cgroup_usage_unregister_event;
6054 } else { 6049 } else {
6055 ret = -EINVAL; 6050 ret = -EINVAL;
6056 goto out_put_cfile; 6051 goto out_put_cfile;
6057 } 6052 }
6058 6053
6059 /* 6054 /*
6060 * Verify @cfile should belong to @css. Also, remaining events are 6055 * Verify @cfile should belong to @css. Also, remaining events are
6061 * automatically removed on cgroup destruction but the removal is 6056 * automatically removed on cgroup destruction but the removal is
6062 * asynchronous, so take an extra ref on @css. 6057 * asynchronous, so take an extra ref on @css.
6063 */ 6058 */
6064 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, 6059 cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
6065 &memory_cgrp_subsys); 6060 &memory_cgrp_subsys);
6066 ret = -EINVAL; 6061 ret = -EINVAL;
6067 if (IS_ERR(cfile_css)) 6062 if (IS_ERR(cfile_css))
6068 goto out_put_cfile; 6063 goto out_put_cfile;
6069 if (cfile_css != css) { 6064 if (cfile_css != css) {
6070 css_put(cfile_css); 6065 css_put(cfile_css);
6071 goto out_put_cfile; 6066 goto out_put_cfile;
6072 } 6067 }
6073 6068
6074 ret = event->register_event(memcg, event->eventfd, buffer); 6069 ret = event->register_event(memcg, event->eventfd, buffer);
6075 if (ret) 6070 if (ret)
6076 goto out_put_css; 6071 goto out_put_css;
6077 6072
6078 efile.file->f_op->poll(efile.file, &event->pt); 6073 efile.file->f_op->poll(efile.file, &event->pt);
6079 6074
6080 spin_lock(&memcg->event_list_lock); 6075 spin_lock(&memcg->event_list_lock);
6081 list_add(&event->list, &memcg->event_list); 6076 list_add(&event->list, &memcg->event_list);
6082 spin_unlock(&memcg->event_list_lock); 6077 spin_unlock(&memcg->event_list_lock);
6083 6078
6084 fdput(cfile); 6079 fdput(cfile);
6085 fdput(efile); 6080 fdput(efile);
6086 6081
6087 return 0; 6082 return 0;
6088 6083
6089 out_put_css: 6084 out_put_css:
6090 css_put(css); 6085 css_put(css);
6091 out_put_cfile: 6086 out_put_cfile:
6092 fdput(cfile); 6087 fdput(cfile);
6093 out_put_eventfd: 6088 out_put_eventfd:
6094 eventfd_ctx_put(event->eventfd); 6089 eventfd_ctx_put(event->eventfd);
6095 out_put_efile: 6090 out_put_efile:
6096 fdput(efile); 6091 fdput(efile);
6097 out_kfree: 6092 out_kfree:
6098 kfree(event); 6093 kfree(event);
6099 6094
6100 return ret; 6095 return ret;
6101 } 6096 }
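
The handler above is driven entirely from userspace: a process creates an eventfd, opens the control file it wants notifications for, and writes "<event_fd> <control_fd> <args>" into cgroup.event_control. Below is a minimal userspace sketch for memory.oom_control, which takes no extra args (the cgroup v1 mount point and group name are assumptions; memory.usage_in_bytes would additionally need a threshold argument):

	#include <sys/eventfd.h>
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *cg = "/sys/fs/cgroup/memory/mygroup";  /* assumed v1 mount */
		char path[256], msg[64];
		uint64_t count;
		int efd, cfd, ecfd, len;

		efd = eventfd(0, 0);				/* <event_fd> */

		snprintf(path, sizeof(path), "%s/memory.oom_control", cg);
		cfd = open(path, O_RDONLY);			/* <control_fd> */

		snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
		ecfd = open(path, O_WRONLY);

		if (efd < 0 || cfd < 0 || ecfd < 0)
			return 1;

		/* exactly the "<event_fd> <control_fd>" string parsed above */
		len = snprintf(msg, sizeof(msg), "%d %d", efd, cfd);
		if (write(ecfd, msg, len) != len)
			return 1;

		/* blocks until the kernel signals an OOM event for the group */
		if (read(efd, &count, sizeof(count)) == sizeof(count))
			printf("oom notifications: %llu\n",
			       (unsigned long long)count);
		return 0;
	}

Note that the kernel side demands only read permission on the control file plus the requirement that it live in the same cgroup directory as cgroup.event_control, which is what the css_tryget_from_dir() comparison above enforces.
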
6102 6097
6103 static struct cftype mem_cgroup_files[] = { 6098 static struct cftype mem_cgroup_files[] = {
6104 { 6099 {
6105 .name = "usage_in_bytes", 6100 .name = "usage_in_bytes",
6106 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6101 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
6107 .read_u64 = mem_cgroup_read_u64, 6102 .read_u64 = mem_cgroup_read_u64,
6108 }, 6103 },
6109 { 6104 {
6110 .name = "max_usage_in_bytes", 6105 .name = "max_usage_in_bytes",
6111 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6106 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
6112 .trigger = mem_cgroup_reset, 6107 .trigger = mem_cgroup_reset,
6113 .read_u64 = mem_cgroup_read_u64, 6108 .read_u64 = mem_cgroup_read_u64,
6114 }, 6109 },
6115 { 6110 {
6116 .name = "limit_in_bytes", 6111 .name = "limit_in_bytes",
6117 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6112 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
6118 .write_string = mem_cgroup_write, 6113 .write_string = mem_cgroup_write,
6119 .read_u64 = mem_cgroup_read_u64, 6114 .read_u64 = mem_cgroup_read_u64,
6120 }, 6115 },
6121 { 6116 {
6122 .name = "soft_limit_in_bytes", 6117 .name = "soft_limit_in_bytes",
6123 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6118 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
6124 .write_string = mem_cgroup_write, 6119 .write_string = mem_cgroup_write,
6125 .read_u64 = mem_cgroup_read_u64, 6120 .read_u64 = mem_cgroup_read_u64,
6126 }, 6121 },
6127 { 6122 {
6128 .name = "failcnt", 6123 .name = "failcnt",
6129 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6124 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
6130 .trigger = mem_cgroup_reset, 6125 .trigger = mem_cgroup_reset,
6131 .read_u64 = mem_cgroup_read_u64, 6126 .read_u64 = mem_cgroup_read_u64,
6132 }, 6127 },
6133 { 6128 {
6134 .name = "stat", 6129 .name = "stat",
6135 .seq_show = memcg_stat_show, 6130 .seq_show = memcg_stat_show,
6136 }, 6131 },
6137 { 6132 {
6138 .name = "force_empty", 6133 .name = "force_empty",
6139 .trigger = mem_cgroup_force_empty_write, 6134 .trigger = mem_cgroup_force_empty_write,
6140 }, 6135 },
6141 { 6136 {
6142 .name = "use_hierarchy", 6137 .name = "use_hierarchy",
6143 .flags = CFTYPE_INSANE, 6138 .flags = CFTYPE_INSANE,
6144 .write_u64 = mem_cgroup_hierarchy_write, 6139 .write_u64 = mem_cgroup_hierarchy_write,
6145 .read_u64 = mem_cgroup_hierarchy_read, 6140 .read_u64 = mem_cgroup_hierarchy_read,
6146 }, 6141 },
6147 { 6142 {
6148 .name = "cgroup.event_control", /* XXX: for compat */ 6143 .name = "cgroup.event_control", /* XXX: for compat */
6149 .write_string = memcg_write_event_control, 6144 .write_string = memcg_write_event_control,
6150 .flags = CFTYPE_NO_PREFIX, 6145 .flags = CFTYPE_NO_PREFIX,
6151 .mode = S_IWUGO, 6146 .mode = S_IWUGO,
6152 }, 6147 },
6153 { 6148 {
6154 .name = "swappiness", 6149 .name = "swappiness",
6155 .read_u64 = mem_cgroup_swappiness_read, 6150 .read_u64 = mem_cgroup_swappiness_read,
6156 .write_u64 = mem_cgroup_swappiness_write, 6151 .write_u64 = mem_cgroup_swappiness_write,
6157 }, 6152 },
6158 { 6153 {
6159 .name = "move_charge_at_immigrate", 6154 .name = "move_charge_at_immigrate",
6160 .read_u64 = mem_cgroup_move_charge_read, 6155 .read_u64 = mem_cgroup_move_charge_read,
6161 .write_u64 = mem_cgroup_move_charge_write, 6156 .write_u64 = mem_cgroup_move_charge_write,
6162 }, 6157 },
6163 { 6158 {
6164 .name = "oom_control", 6159 .name = "oom_control",
6165 .seq_show = mem_cgroup_oom_control_read, 6160 .seq_show = mem_cgroup_oom_control_read,
6166 .write_u64 = mem_cgroup_oom_control_write, 6161 .write_u64 = mem_cgroup_oom_control_write,
6167 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6162 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6168 }, 6163 },
6169 { 6164 {
6170 .name = "pressure_level", 6165 .name = "pressure_level",
6171 }, 6166 },
6172 #ifdef CONFIG_NUMA 6167 #ifdef CONFIG_NUMA
6173 { 6168 {
6174 .name = "numa_stat", 6169 .name = "numa_stat",
6175 .seq_show = memcg_numa_stat_show, 6170 .seq_show = memcg_numa_stat_show,
6176 }, 6171 },
6177 #endif 6172 #endif
6178 #ifdef CONFIG_MEMCG_KMEM 6173 #ifdef CONFIG_MEMCG_KMEM
6179 { 6174 {
6180 .name = "kmem.limit_in_bytes", 6175 .name = "kmem.limit_in_bytes",
6181 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6176 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6182 .write_string = mem_cgroup_write, 6177 .write_string = mem_cgroup_write,
6183 .read_u64 = mem_cgroup_read_u64, 6178 .read_u64 = mem_cgroup_read_u64,
6184 }, 6179 },
6185 { 6180 {
6186 .name = "kmem.usage_in_bytes", 6181 .name = "kmem.usage_in_bytes",
6187 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6182 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6188 .read_u64 = mem_cgroup_read_u64, 6183 .read_u64 = mem_cgroup_read_u64,
6189 }, 6184 },
6190 { 6185 {
6191 .name = "kmem.failcnt", 6186 .name = "kmem.failcnt",
6192 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6187 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6193 .trigger = mem_cgroup_reset, 6188 .trigger = mem_cgroup_reset,
6194 .read_u64 = mem_cgroup_read_u64, 6189 .read_u64 = mem_cgroup_read_u64,
6195 }, 6190 },
6196 { 6191 {
6197 .name = "kmem.max_usage_in_bytes", 6192 .name = "kmem.max_usage_in_bytes",
6198 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6193 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6199 .trigger = mem_cgroup_reset, 6194 .trigger = mem_cgroup_reset,
6200 .read_u64 = mem_cgroup_read_u64, 6195 .read_u64 = mem_cgroup_read_u64,
6201 }, 6196 },
6202 #ifdef CONFIG_SLABINFO 6197 #ifdef CONFIG_SLABINFO
6203 { 6198 {
6204 .name = "kmem.slabinfo", 6199 .name = "kmem.slabinfo",
6205 .seq_show = mem_cgroup_slabinfo_read, 6200 .seq_show = mem_cgroup_slabinfo_read,
6206 }, 6201 },
6207 #endif 6202 #endif
6208 #endif 6203 #endif
6209 { }, /* terminate */ 6204 { }, /* terminate */
6210 }; 6205 };
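
The .private values in this table pack a counter type (_MEM, _MEMSWAP, _KMEM) and a member (RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_SOFT_LIMIT, RES_FAILCNT) into one integer so that a single read/write/trigger handler can serve all of these files. A hedged sketch of the encoding, assuming the usual 16-bit split behind MEMFILE_PRIVATE (the real macros are defined earlier in memcontrol.c, outside this hunk):

	/* hedged sketch -- not copied from the file, just the assumed shape */
	#define MEMFILE_PRIVATE(type, attr)	(((type) << 16) | (attr))
	#define MEMFILE_TYPE(val)		(((val) >> 16) & 0xffff)
	#define MEMFILE_ATTR(val)		((val) & 0xffff)

	/*
	 * A handler such as mem_cgroup_read_u64() can then recover both halves:
	 *	type = MEMFILE_TYPE(cft->private);	// which res_counter
	 *	attr = MEMFILE_ATTR(cft->private);	// which member of it
	 */
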
6211 6206
6212 #ifdef CONFIG_MEMCG_SWAP 6207 #ifdef CONFIG_MEMCG_SWAP
6213 static struct cftype memsw_cgroup_files[] = { 6208 static struct cftype memsw_cgroup_files[] = {
6214 { 6209 {
6215 .name = "memsw.usage_in_bytes", 6210 .name = "memsw.usage_in_bytes",
6216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6211 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6217 .read_u64 = mem_cgroup_read_u64, 6212 .read_u64 = mem_cgroup_read_u64,
6218 }, 6213 },
6219 { 6214 {
6220 .name = "memsw.max_usage_in_bytes", 6215 .name = "memsw.max_usage_in_bytes",
6221 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6216 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6222 .trigger = mem_cgroup_reset, 6217 .trigger = mem_cgroup_reset,
6223 .read_u64 = mem_cgroup_read_u64, 6218 .read_u64 = mem_cgroup_read_u64,
6224 }, 6219 },
6225 { 6220 {
6226 .name = "memsw.limit_in_bytes", 6221 .name = "memsw.limit_in_bytes",
6227 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6222 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6228 .write_string = mem_cgroup_write, 6223 .write_string = mem_cgroup_write,
6229 .read_u64 = mem_cgroup_read_u64, 6224 .read_u64 = mem_cgroup_read_u64,
6230 }, 6225 },
6231 { 6226 {
6232 .name = "memsw.failcnt", 6227 .name = "memsw.failcnt",
6233 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6228 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6234 .trigger = mem_cgroup_reset, 6229 .trigger = mem_cgroup_reset,
6235 .read_u64 = mem_cgroup_read_u64, 6230 .read_u64 = mem_cgroup_read_u64,
6236 }, 6231 },
6237 { }, /* terminate */ 6232 { }, /* terminate */
6238 }; 6233 };
6239 #endif 6234 #endif
6240 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6235 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6241 { 6236 {
6242 struct mem_cgroup_per_node *pn; 6237 struct mem_cgroup_per_node *pn;
6243 struct mem_cgroup_per_zone *mz; 6238 struct mem_cgroup_per_zone *mz;
6244 int zone, tmp = node; 6239 int zone, tmp = node;
6245 /* 6240 /*
6246 * This routine is called against possible nodes. 6241 * This routine is called against possible nodes.
6247 	 * But it's a BUG to call kmalloc() against an offline node. 6242 	 * But it's a BUG to call kmalloc() against an offline node.
6248 * 6243 *
6249 * TODO: this routine can waste much memory for nodes which will 6244 * TODO: this routine can waste much memory for nodes which will
6250 * never be onlined. It's better to use memory hotplug callback 6245 * never be onlined. It's better to use memory hotplug callback
6251 * function. 6246 * function.
6252 */ 6247 */
6253 if (!node_state(node, N_NORMAL_MEMORY)) 6248 if (!node_state(node, N_NORMAL_MEMORY))
6254 tmp = -1; 6249 tmp = -1;
6255 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6250 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
6256 if (!pn) 6251 if (!pn)
6257 return 1; 6252 return 1;
6258 6253
6259 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6254 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6260 mz = &pn->zoneinfo[zone]; 6255 mz = &pn->zoneinfo[zone];
6261 lruvec_init(&mz->lruvec); 6256 lruvec_init(&mz->lruvec);
6262 mz->usage_in_excess = 0; 6257 mz->usage_in_excess = 0;
6263 mz->on_tree = false; 6258 mz->on_tree = false;
6264 mz->memcg = memcg; 6259 mz->memcg = memcg;
6265 } 6260 }
6266 memcg->nodeinfo[node] = pn; 6261 memcg->nodeinfo[node] = pn;
6267 return 0; 6262 return 0;
6268 } 6263 }
6269 6264
6270 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6265 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6271 { 6266 {
6272 kfree(memcg->nodeinfo[node]); 6267 kfree(memcg->nodeinfo[node]);
6273 } 6268 }
6274 6269
6275 static struct mem_cgroup *mem_cgroup_alloc(void) 6270 static struct mem_cgroup *mem_cgroup_alloc(void)
6276 { 6271 {
6277 struct mem_cgroup *memcg; 6272 struct mem_cgroup *memcg;
6278 size_t size; 6273 size_t size;
6279 6274
6280 size = sizeof(struct mem_cgroup); 6275 size = sizeof(struct mem_cgroup);
6281 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); 6276 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6282 6277
6283 memcg = kzalloc(size, GFP_KERNEL); 6278 memcg = kzalloc(size, GFP_KERNEL);
6284 if (!memcg) 6279 if (!memcg)
6285 return NULL; 6280 return NULL;
6286 6281
6287 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6282 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
6288 if (!memcg->stat) 6283 if (!memcg->stat)
6289 goto out_free; 6284 goto out_free;
6290 spin_lock_init(&memcg->pcp_counter_lock); 6285 spin_lock_init(&memcg->pcp_counter_lock);
6291 return memcg; 6286 return memcg;
6292 6287
6293 out_free: 6288 out_free:
6294 kfree(memcg); 6289 kfree(memcg);
6295 return NULL; 6290 return NULL;
6296 } 6291 }
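
The size computed in mem_cgroup_alloc() is sizeof(struct mem_cgroup) plus one pointer per possible node, which implies the structure ends in a per-node pointer array sized at run time; a single kzalloc() therefore covers the fixed part and all nr_node_ids slots at once. An abridged, hedged sketch of that tail layout (the real definition sits earlier in this file):

	/* abridged sketch -- only fields referenced in this hunk are shown */
	struct mem_cgroup {
		struct cgroup_subsys_state css;
		struct res_counter res;		/* memory */
		struct res_counter memsw;	/* memory + swap */
		struct res_counter kmem;	/* kernel memory */
		struct mem_cgroup_stat_cpu __percpu *stat;
		/* ... many more fields ... */
		struct mem_cgroup_per_node *nodeinfo[0];	/* nr_node_ids slots */
	};

alloc_mem_cgroup_per_zone_info() then populates nodeinfo[] one node at a time, falling back to node -1 (any node) when the target node has no normal memory.
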
6297 6292
6298 /* 6293 /*
6299 * At destroying mem_cgroup, references from swap_cgroup can remain. 6294 * At destroying mem_cgroup, references from swap_cgroup can remain.
6300 * (scanning all at force_empty is too costly...) 6295 * (scanning all at force_empty is too costly...)
6301 * 6296 *
6302 * Instead of clearing all references at force_empty, we remember 6297 * Instead of clearing all references at force_empty, we remember
6303 * the number of reference from swap_cgroup and free mem_cgroup when 6298 * the number of reference from swap_cgroup and free mem_cgroup when
6304 * it goes down to 0. 6299 * it goes down to 0.
6305 * 6300 *
6306 * Removal of cgroup itself succeeds regardless of refs from swap. 6301 * Removal of cgroup itself succeeds regardless of refs from swap.
6307 */ 6302 */
6308 6303
6309 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6304 static void __mem_cgroup_free(struct mem_cgroup *memcg)
6310 { 6305 {
6311 int node; 6306 int node;
6312 6307
6313 mem_cgroup_remove_from_trees(memcg); 6308 mem_cgroup_remove_from_trees(memcg);
6314 6309
6315 for_each_node(node) 6310 for_each_node(node)
6316 free_mem_cgroup_per_zone_info(memcg, node); 6311 free_mem_cgroup_per_zone_info(memcg, node);
6317 6312
6318 free_percpu(memcg->stat); 6313 free_percpu(memcg->stat);
6319 6314
6320 /* 6315 /*
6321 * We need to make sure that (at least for now), the jump label 6316 * We need to make sure that (at least for now), the jump label
6322 * destruction code runs outside of the cgroup lock. This is because 6317 * destruction code runs outside of the cgroup lock. This is because
6323 * get_online_cpus(), which is called from the static_branch update, 6318 * get_online_cpus(), which is called from the static_branch update,
6324 * can't be called inside the cgroup_lock. cpusets are the ones 6319 * can't be called inside the cgroup_lock. cpusets are the ones
6325 * enforcing this dependency, so if they ever change, we might as well. 6320 * enforcing this dependency, so if they ever change, we might as well.
6326 * 6321 *
6327 * schedule_work() will guarantee this happens. Be careful if you need 6322 * schedule_work() will guarantee this happens. Be careful if you need
6328 * to move this code around, and make sure it is outside 6323 * to move this code around, and make sure it is outside
6329 * the cgroup_lock. 6324 * the cgroup_lock.
6330 */ 6325 */
6331 disarm_static_keys(memcg); 6326 disarm_static_keys(memcg);
6332 kfree(memcg); 6327 kfree(memcg);
6333 } 6328 }
6334 6329
6335 /* 6330 /*
6336 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 6331 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
6337 */ 6332 */
6338 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) 6333 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6339 { 6334 {
6340 if (!memcg->res.parent) 6335 if (!memcg->res.parent)
6341 return NULL; 6336 return NULL;
6342 return mem_cgroup_from_res_counter(memcg->res.parent, res); 6337 return mem_cgroup_from_res_counter(memcg->res.parent, res);
6343 } 6338 }
6344 EXPORT_SYMBOL(parent_mem_cgroup); 6339 EXPORT_SYMBOL(parent_mem_cgroup);
6345 6340
6346 static void __init mem_cgroup_soft_limit_tree_init(void) 6341 static void __init mem_cgroup_soft_limit_tree_init(void)
6347 { 6342 {
6348 struct mem_cgroup_tree_per_node *rtpn; 6343 struct mem_cgroup_tree_per_node *rtpn;
6349 struct mem_cgroup_tree_per_zone *rtpz; 6344 struct mem_cgroup_tree_per_zone *rtpz;
6350 int tmp, node, zone; 6345 int tmp, node, zone;
6351 6346
6352 for_each_node(node) { 6347 for_each_node(node) {
6353 tmp = node; 6348 tmp = node;
6354 if (!node_state(node, N_NORMAL_MEMORY)) 6349 if (!node_state(node, N_NORMAL_MEMORY))
6355 tmp = -1; 6350 tmp = -1;
6356 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 6351 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6357 BUG_ON(!rtpn); 6352 BUG_ON(!rtpn);
6358 6353
6359 soft_limit_tree.rb_tree_per_node[node] = rtpn; 6354 soft_limit_tree.rb_tree_per_node[node] = rtpn;
6360 6355
6361 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6356 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6362 rtpz = &rtpn->rb_tree_per_zone[zone]; 6357 rtpz = &rtpn->rb_tree_per_zone[zone];
6363 rtpz->rb_root = RB_ROOT; 6358 rtpz->rb_root = RB_ROOT;
6364 spin_lock_init(&rtpz->lock); 6359 spin_lock_init(&rtpz->lock);
6365 } 6360 }
6366 } 6361 }
6367 } 6362 }
6368 6363
6369 static struct cgroup_subsys_state * __ref 6364 static struct cgroup_subsys_state * __ref
6370 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 6365 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6371 { 6366 {
6372 struct mem_cgroup *memcg; 6367 struct mem_cgroup *memcg;
6373 long error = -ENOMEM; 6368 long error = -ENOMEM;
6374 int node; 6369 int node;
6375 6370
6376 memcg = mem_cgroup_alloc(); 6371 memcg = mem_cgroup_alloc();
6377 if (!memcg) 6372 if (!memcg)
6378 return ERR_PTR(error); 6373 return ERR_PTR(error);
6379 6374
6380 for_each_node(node) 6375 for_each_node(node)
6381 if (alloc_mem_cgroup_per_zone_info(memcg, node)) 6376 if (alloc_mem_cgroup_per_zone_info(memcg, node))
6382 goto free_out; 6377 goto free_out;
6383 6378
6384 /* root ? */ 6379 /* root ? */
6385 if (parent_css == NULL) { 6380 if (parent_css == NULL) {
6386 root_mem_cgroup = memcg; 6381 root_mem_cgroup = memcg;
6387 res_counter_init(&memcg->res, NULL); 6382 res_counter_init(&memcg->res, NULL);
6388 res_counter_init(&memcg->memsw, NULL); 6383 res_counter_init(&memcg->memsw, NULL);
6389 res_counter_init(&memcg->kmem, NULL); 6384 res_counter_init(&memcg->kmem, NULL);
6390 } 6385 }
6391 6386
6392 memcg->last_scanned_node = MAX_NUMNODES; 6387 memcg->last_scanned_node = MAX_NUMNODES;
6393 INIT_LIST_HEAD(&memcg->oom_notify); 6388 INIT_LIST_HEAD(&memcg->oom_notify);
6394 memcg->move_charge_at_immigrate = 0; 6389 memcg->move_charge_at_immigrate = 0;
6395 mutex_init(&memcg->thresholds_lock); 6390 mutex_init(&memcg->thresholds_lock);
6396 spin_lock_init(&memcg->move_lock); 6391 spin_lock_init(&memcg->move_lock);
6397 vmpressure_init(&memcg->vmpressure); 6392 vmpressure_init(&memcg->vmpressure);
6398 INIT_LIST_HEAD(&memcg->event_list); 6393 INIT_LIST_HEAD(&memcg->event_list);
6399 spin_lock_init(&memcg->event_list_lock); 6394 spin_lock_init(&memcg->event_list_lock);
6400 6395
6401 return &memcg->css; 6396 return &memcg->css;
6402 6397
6403 free_out: 6398 free_out:
6404 __mem_cgroup_free(memcg); 6399 __mem_cgroup_free(memcg);
6405 return ERR_PTR(error); 6400 return ERR_PTR(error);
6406 } 6401 }
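
mem_cgroup_css_alloc() is the first of the css lifecycle callbacks (alloc, online, offline, free) that make up the rest of this hunk; the cgroup core invokes them through the subsystem descriptor defined further down in the file. A hedged sketch of that wiring (the exact initializer is not part of this diff):

	/* hedged sketch of the subsystem hookup; see the real definition
	 * of memory_cgrp_subsys near the end of memcontrol.c */
	struct cgroup_subsys memory_cgrp_subsys = {
		.css_alloc	= mem_cgroup_css_alloc,
		.css_online	= mem_cgroup_css_online,
		.css_offline	= mem_cgroup_css_offline,
		.css_free	= mem_cgroup_css_free,
		.can_attach	= mem_cgroup_can_attach,
		.cancel_attach	= mem_cgroup_cancel_attach,
		.base_cftypes	= mem_cgroup_files,
		/* ... */
	};
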
6407 6402
6408 static int 6403 static int
6409 mem_cgroup_css_online(struct cgroup_subsys_state *css) 6404 mem_cgroup_css_online(struct cgroup_subsys_state *css)
6410 { 6405 {
6411 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6406 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6412 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6407 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6413 6408
6414 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6409 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6415 return -ENOSPC; 6410 return -ENOSPC;
6416 6411
6417 if (!parent) 6412 if (!parent)
6418 return 0; 6413 return 0;
6419 6414
6420 mutex_lock(&memcg_create_mutex); 6415 mutex_lock(&memcg_create_mutex);
6421 6416
6422 memcg->use_hierarchy = parent->use_hierarchy; 6417 memcg->use_hierarchy = parent->use_hierarchy;
6423 memcg->oom_kill_disable = parent->oom_kill_disable; 6418 memcg->oom_kill_disable = parent->oom_kill_disable;
6424 memcg->swappiness = mem_cgroup_swappiness(parent); 6419 memcg->swappiness = mem_cgroup_swappiness(parent);
6425 6420
6426 if (parent->use_hierarchy) { 6421 if (parent->use_hierarchy) {
6427 res_counter_init(&memcg->res, &parent->res); 6422 res_counter_init(&memcg->res, &parent->res);
6428 res_counter_init(&memcg->memsw, &parent->memsw); 6423 res_counter_init(&memcg->memsw, &parent->memsw);
6429 res_counter_init(&memcg->kmem, &parent->kmem); 6424 res_counter_init(&memcg->kmem, &parent->kmem);
6430 6425
6431 /* 6426 /*
6432 * No need to take a reference to the parent because cgroup 6427 * No need to take a reference to the parent because cgroup
6433 * core guarantees its existence. 6428 * core guarantees its existence.
6434 */ 6429 */
6435 } else { 6430 } else {
6436 res_counter_init(&memcg->res, NULL); 6431 res_counter_init(&memcg->res, NULL);
6437 res_counter_init(&memcg->memsw, NULL); 6432 res_counter_init(&memcg->memsw, NULL);
6438 res_counter_init(&memcg->kmem, NULL); 6433 res_counter_init(&memcg->kmem, NULL);
6439 /* 6434 /*
6440 		 * Deeper hierarchy with use_hierarchy == false doesn't make 6435 		 * Deeper hierarchy with use_hierarchy == false doesn't make
6441 * much sense so let cgroup subsystem know about this 6436 * much sense so let cgroup subsystem know about this
6442 * unfortunate state in our controller. 6437 * unfortunate state in our controller.
6443 */ 6438 */
6444 if (parent != root_mem_cgroup) 6439 if (parent != root_mem_cgroup)
6445 memory_cgrp_subsys.broken_hierarchy = true; 6440 memory_cgrp_subsys.broken_hierarchy = true;
6446 } 6441 }
6447 mutex_unlock(&memcg_create_mutex); 6442 mutex_unlock(&memcg_create_mutex);
6448 6443
6449 return memcg_init_kmem(memcg, &memory_cgrp_subsys); 6444 return memcg_init_kmem(memcg, &memory_cgrp_subsys);
6450 } 6445 }
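
When use_hierarchy is inherited as true, the res_counter_init(..., &parent->res) calls above chain each child counter to its parent, so a charge must fit under every ancestor's limit before it succeeds and is unwound from all of them on failure. A simplified, hedged sketch of that walk (the names below are made up; the real code, with proper locking, lives in kernel/res_counter.c):

	/* illustrative only -- not the kernel's res_counter implementation */
	struct counter {
		unsigned long long usage, limit;
		struct counter *parent;		/* NULL for an unparented root */
	};

	static int charge_up_the_chain(struct counter *counter, unsigned long val)
	{
		struct counter *c, *u;

		for (c = counter; c != NULL; c = c->parent) {
			if (c->usage + val > c->limit) {
				/* undo the levels already charged */
				for (u = counter; u != c; u = u->parent)
					u->usage -= val;
				return -1;
			}
			c->usage += val;
		}
		return 0;
	}

This is also why the else branch passes NULL parents: a non-hierarchical group accounts only for itself, and nesting such a group below a non-root parent is exactly what gets flagged as a broken hierarchy.
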
6451 6446
6452 /* 6447 /*
6453 * Announce all parents that a group from their hierarchy is gone. 6448 * Announce all parents that a group from their hierarchy is gone.
6454 */ 6449 */
6455 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 6450 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6456 { 6451 {
6457 struct mem_cgroup *parent = memcg; 6452 struct mem_cgroup *parent = memcg;
6458 6453
6459 while ((parent = parent_mem_cgroup(parent))) 6454 while ((parent = parent_mem_cgroup(parent)))
6460 mem_cgroup_iter_invalidate(parent); 6455 mem_cgroup_iter_invalidate(parent);
6461 6456
6462 /* 6457 /*
6463 * if the root memcg is not hierarchical we have to check it 6458 * if the root memcg is not hierarchical we have to check it
6464 	 * explicitly. 6459 	 * explicitly.
6465 */ 6460 */
6466 if (!root_mem_cgroup->use_hierarchy) 6461 if (!root_mem_cgroup->use_hierarchy)
6467 mem_cgroup_iter_invalidate(root_mem_cgroup); 6462 mem_cgroup_iter_invalidate(root_mem_cgroup);
6468 } 6463 }
6469 6464
6470 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6465 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6471 { 6466 {
6472 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6467 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6473 struct mem_cgroup_event *event, *tmp; 6468 struct mem_cgroup_event *event, *tmp;
6474 struct cgroup_subsys_state *iter; 6469 struct cgroup_subsys_state *iter;
6475 6470
6476 /* 6471 /*
6477 * Unregister events and notify userspace. 6472 * Unregister events and notify userspace.
6478 * Notify userspace about cgroup removing only after rmdir of cgroup 6473 * Notify userspace about cgroup removing only after rmdir of cgroup
6479 * directory to avoid race between userspace and kernelspace. 6474 * directory to avoid race between userspace and kernelspace.
6480 */ 6475 */
6481 spin_lock(&memcg->event_list_lock); 6476 spin_lock(&memcg->event_list_lock);
6482 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) { 6477 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6483 list_del_init(&event->list); 6478 list_del_init(&event->list);
6484 schedule_work(&event->remove); 6479 schedule_work(&event->remove);
6485 } 6480 }
6486 spin_unlock(&memcg->event_list_lock); 6481 spin_unlock(&memcg->event_list_lock);
6487 6482
6488 kmem_cgroup_css_offline(memcg); 6483 kmem_cgroup_css_offline(memcg);
6489 6484
6490 mem_cgroup_invalidate_reclaim_iterators(memcg); 6485 mem_cgroup_invalidate_reclaim_iterators(memcg);
6491 6486
6492 /* 6487 /*
6493 * This requires that offlining is serialized. Right now that is 6488 * This requires that offlining is serialized. Right now that is
6494 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 6489 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6495 */ 6490 */
6496 css_for_each_descendant_post(iter, css) 6491 css_for_each_descendant_post(iter, css)
6497 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 6492 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6498 6493
6499 mem_cgroup_destroy_all_caches(memcg); 6494 mem_cgroup_destroy_all_caches(memcg);
6500 vmpressure_cleanup(&memcg->vmpressure); 6495 vmpressure_cleanup(&memcg->vmpressure);
6501 } 6496 }
6502 6497
6503 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 6498 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
6504 { 6499 {
6505 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6500 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6506 /* 6501 /*
6507 * XXX: css_offline() would be where we should reparent all 6502 * XXX: css_offline() would be where we should reparent all
6508 * memory to prepare the cgroup for destruction. However, 6503 * memory to prepare the cgroup for destruction. However,
6509 * memcg does not do css_tryget() and res_counter charging 6504 * memcg does not do css_tryget() and res_counter charging
6510 * under the same RCU lock region, which means that charging 6505 * under the same RCU lock region, which means that charging
6511 * could race with offlining. Offlining only happens to 6506 * could race with offlining. Offlining only happens to
6512 * cgroups with no tasks in them but charges can show up 6507 * cgroups with no tasks in them but charges can show up
6513 * without any tasks from the swapin path when the target 6508 * without any tasks from the swapin path when the target
6514 * memcg is looked up from the swapout record and not from the 6509 * memcg is looked up from the swapout record and not from the
6515 * current task as it usually is. A race like this can leak 6510 * current task as it usually is. A race like this can leak
6516 * charges and put pages with stale cgroup pointers into 6511 * charges and put pages with stale cgroup pointers into
6517 * circulation: 6512 * circulation:
6518 * 6513 *
6519 * #0 #1 6514 * #0 #1
6520 * lookup_swap_cgroup_id() 6515 * lookup_swap_cgroup_id()
6521 * rcu_read_lock() 6516 * rcu_read_lock()
6522 * mem_cgroup_lookup() 6517 * mem_cgroup_lookup()
6523 * css_tryget() 6518 * css_tryget()
6524 * rcu_read_unlock() 6519 * rcu_read_unlock()
6525 * disable css_tryget() 6520 * disable css_tryget()
6526 * call_rcu() 6521 * call_rcu()
6527 * offline_css() 6522 * offline_css()
6528 * reparent_charges() 6523 * reparent_charges()
6529 * res_counter_charge() 6524 * res_counter_charge()
6530 * css_put() 6525 * css_put()
6531 * css_free() 6526 * css_free()
6532 * pc->mem_cgroup = dead memcg 6527 * pc->mem_cgroup = dead memcg
6533 * add page to lru 6528 * add page to lru
6534 * 6529 *
6535 * The bulk of the charges are still moved in offline_css() to 6530 * The bulk of the charges are still moved in offline_css() to
6536 * avoid pinning a lot of pages in case a long-term reference 6531 * avoid pinning a lot of pages in case a long-term reference
6537 * like a swapout record is deferring the css_free() to long 6532 * like a swapout record is deferring the css_free() to long
6538 * after offlining. But this makes sure we catch any charges 6533 * after offlining. But this makes sure we catch any charges
6539 * made after offlining: 6534 * made after offlining:
6540 */ 6535 */
6541 mem_cgroup_reparent_charges(memcg); 6536 mem_cgroup_reparent_charges(memcg);
6542 6537
6543 memcg_destroy_kmem(memcg); 6538 memcg_destroy_kmem(memcg);
6544 __mem_cgroup_free(memcg); 6539 __mem_cgroup_free(memcg);
6545 } 6540 }
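
The race diagram in the comment above hinges on the id-to-memcg lookup and css_tryget() happening inside one RCU section while the res_counter charge lands later, outside it. A hedged sketch of that lookup side (#0 in the diagram); the wrapper name is illustrative, but lookup_swap_cgroup_id(), mem_cgroup_lookup() and css_tryget() are the helpers the comment refers to:

	static struct mem_cgroup *lookup_memcg_from_swap(swp_entry_t ent)
	{
		unsigned short id = lookup_swap_cgroup_id(ent);
		struct mem_cgroup *memcg;

		rcu_read_lock();
		memcg = mem_cgroup_lookup(id);		/* swap record id -> memcg */
		if (memcg && !css_tryget(&memcg->css))	/* fails once tryget is disabled */
			memcg = NULL;
		rcu_read_unlock();

		/*
		 * The charge against memcg->res happens later, outside this
		 * RCU section -- the window that css_free() closes above by
		 * calling mem_cgroup_reparent_charges() one last time.
		 */
		return memcg;
	}
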
6546 6541
6547 #ifdef CONFIG_MMU 6542 #ifdef CONFIG_MMU
6548 /* Handlers for move charge at task migration. */ 6543 /* Handlers for move charge at task migration. */
6549 #define PRECHARGE_COUNT_AT_ONCE 256 6544 #define PRECHARGE_COUNT_AT_ONCE 256
6550 static int mem_cgroup_do_precharge(unsigned long count) 6545 static int mem_cgroup_do_precharge(unsigned long count)
6551 { 6546 {
6552 int ret = 0; 6547 int ret = 0;
6553 int batch_count = PRECHARGE_COUNT_AT_ONCE; 6548 int batch_count = PRECHARGE_COUNT_AT_ONCE;
6554 struct mem_cgroup *memcg = mc.to; 6549 struct mem_cgroup *memcg = mc.to;
6555 6550
6556 if (mem_cgroup_is_root(memcg)) { 6551 if (mem_cgroup_is_root(memcg)) {
6557 mc.precharge += count; 6552 mc.precharge += count;
6558 /* we don't need css_get for root */ 6553 /* we don't need css_get for root */
6559 return ret; 6554 return ret;
6560 } 6555 }
6561 /* try to charge at once */ 6556 /* try to charge at once */
6562 if (count > 1) { 6557 if (count > 1) {
6563 struct res_counter *dummy; 6558 struct res_counter *dummy;
6564 /* 6559 /*
6565 * "memcg" cannot be under rmdir() because we've already checked 6560 * "memcg" cannot be under rmdir() because we've already checked
6566 * by cgroup_lock_live_cgroup() that it is not removed and we 6561 * by cgroup_lock_live_cgroup() that it is not removed and we
6567 * are still under the same cgroup_mutex. So we can postpone 6562 * are still under the same cgroup_mutex. So we can postpone
6568 * css_get(). 6563 * css_get().
6569 */ 6564 */
6570 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) 6565 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6571 goto one_by_one; 6566 goto one_by_one;
6572 if (do_swap_account && res_counter_charge(&memcg->memsw, 6567 if (do_swap_account && res_counter_charge(&memcg->memsw,
6573 PAGE_SIZE * count, &dummy)) { 6568 PAGE_SIZE * count, &dummy)) {
6574 res_counter_uncharge(&memcg->res, PAGE_SIZE * count); 6569 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6575 goto one_by_one; 6570 goto one_by_one;
6576 } 6571 }
6577 mc.precharge += count; 6572 mc.precharge += count;
6578 return ret; 6573 return ret;
6579 } 6574 }
6580 one_by_one: 6575 one_by_one:
6581 /* fall back to one by one charge */ 6576 /* fall back to one by one charge */
6582 while (count--) { 6577 while (count--) {
6583 if (signal_pending(current)) { 6578 if (signal_pending(current)) {
6584 ret = -EINTR; 6579 ret = -EINTR;
6585 break; 6580 break;
6586 } 6581 }
6587 if (!batch_count--) { 6582 if (!batch_count--) {
6588 batch_count = PRECHARGE_COUNT_AT_ONCE; 6583 batch_count = PRECHARGE_COUNT_AT_ONCE;
6589 cond_resched(); 6584 cond_resched();
6590 } 6585 }
6591 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); 6586 ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false);
6592 if (ret) 6587 if (ret)
6593 /* mem_cgroup_clear_mc() will do uncharge later */ 6588 /* mem_cgroup_clear_mc() will do uncharge later */
6594 return ret; 6589 return ret;
6595 mc.precharge++; 6590 mc.precharge++;
6596 } 6591 }
6597 return ret; 6592 return ret;
6598 } 6593 }
6599 6594
6600 /** 6595 /**
6601 * get_mctgt_type - get target type of moving charge 6596 * get_mctgt_type - get target type of moving charge
6602 * @vma: the vma the pte to be checked belongs 6597 * @vma: the vma the pte to be checked belongs
6603 * @addr: the address corresponding to the pte to be checked 6598 * @addr: the address corresponding to the pte to be checked
6604 * @ptent: the pte to be checked 6599 * @ptent: the pte to be checked
6605 * @target: the pointer the target page or swap ent will be stored(can be NULL) 6600 * @target: the pointer the target page or swap ent will be stored(can be NULL)
6606 * 6601 *
6607 * Returns 6602 * Returns
6608 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 6603 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
6609 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 6604 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
6610 * move charge. if @target is not NULL, the page is stored in target->page 6605 * move charge. if @target is not NULL, the page is stored in target->page
6611 * with extra refcnt got(Callers should handle it). 6606 * with extra refcnt got(Callers should handle it).
6612 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 6607 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
6613 * target for charge migration. if @target is not NULL, the entry is stored 6608 * target for charge migration. if @target is not NULL, the entry is stored
6614 * in target->ent. 6609 * in target->ent.
6615 * 6610 *
6616 * Called with pte lock held. 6611 * Called with pte lock held.
6617 */ 6612 */
6618 union mc_target { 6613 union mc_target {
6619 struct page *page; 6614 struct page *page;
6620 swp_entry_t ent; 6615 swp_entry_t ent;
6621 }; 6616 };
6622 6617
6623 enum mc_target_type { 6618 enum mc_target_type {
6624 MC_TARGET_NONE = 0, 6619 MC_TARGET_NONE = 0,
6625 MC_TARGET_PAGE, 6620 MC_TARGET_PAGE,
6626 MC_TARGET_SWAP, 6621 MC_TARGET_SWAP,
6627 }; 6622 };
6628 6623
6629 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 6624 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
6630 unsigned long addr, pte_t ptent) 6625 unsigned long addr, pte_t ptent)
6631 { 6626 {
6632 struct page *page = vm_normal_page(vma, addr, ptent); 6627 struct page *page = vm_normal_page(vma, addr, ptent);
6633 6628
6634 if (!page || !page_mapped(page)) 6629 if (!page || !page_mapped(page))
6635 return NULL; 6630 return NULL;
6636 if (PageAnon(page)) { 6631 if (PageAnon(page)) {
6637 /* we don't move shared anon */ 6632 /* we don't move shared anon */
6638 if (!move_anon()) 6633 if (!move_anon())
6639 return NULL; 6634 return NULL;
6640 } else if (!move_file()) 6635 } else if (!move_file())
6641 /* we ignore mapcount for file pages */ 6636 /* we ignore mapcount for file pages */
6642 return NULL; 6637 return NULL;
6643 if (!get_page_unless_zero(page)) 6638 if (!get_page_unless_zero(page))
6644 return NULL; 6639 return NULL;
6645 6640
6646 return page; 6641 return page;
6647 } 6642 }
6648 6643
6649 #ifdef CONFIG_SWAP 6644 #ifdef CONFIG_SWAP
6650 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6645 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6651 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6646 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6652 { 6647 {
6653 struct page *page = NULL; 6648 struct page *page = NULL;
6654 swp_entry_t ent = pte_to_swp_entry(ptent); 6649 swp_entry_t ent = pte_to_swp_entry(ptent);
6655 6650
6656 if (!move_anon() || non_swap_entry(ent)) 6651 if (!move_anon() || non_swap_entry(ent))
6657 return NULL; 6652 return NULL;
6658 /* 6653 /*
6659 * Because lookup_swap_cache() updates some statistics counter, 6654 * Because lookup_swap_cache() updates some statistics counter,
6660 * we call find_get_page() with swapper_space directly. 6655 * we call find_get_page() with swapper_space directly.
6661 */ 6656 */
6662 page = find_get_page(swap_address_space(ent), ent.val); 6657 page = find_get_page(swap_address_space(ent), ent.val);
6663 if (do_swap_account) 6658 if (do_swap_account)
6664 entry->val = ent.val; 6659 entry->val = ent.val;
6665 6660
6666 return page; 6661 return page;
6667 } 6662 }
6668 #else 6663 #else
6669 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 6664 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
6670 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6665 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6671 { 6666 {
6672 return NULL; 6667 return NULL;
6673 } 6668 }
6674 #endif 6669 #endif
6675 6670
6676 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 6671 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
6677 unsigned long addr, pte_t ptent, swp_entry_t *entry) 6672 unsigned long addr, pte_t ptent, swp_entry_t *entry)
6678 { 6673 {
6679 struct page *page = NULL; 6674 struct page *page = NULL;
6680 struct address_space *mapping; 6675 struct address_space *mapping;
6681 pgoff_t pgoff; 6676 pgoff_t pgoff;
6682 6677
6683 if (!vma->vm_file) /* anonymous vma */ 6678 if (!vma->vm_file) /* anonymous vma */
6684 return NULL; 6679 return NULL;
6685 if (!move_file()) 6680 if (!move_file())
6686 return NULL; 6681 return NULL;
6687 6682
6688 mapping = vma->vm_file->f_mapping; 6683 mapping = vma->vm_file->f_mapping;
6689 if (pte_none(ptent)) 6684 if (pte_none(ptent))
6690 pgoff = linear_page_index(vma, addr); 6685 pgoff = linear_page_index(vma, addr);
6691 else /* pte_file(ptent) is true */ 6686 else /* pte_file(ptent) is true */
6692 pgoff = pte_to_pgoff(ptent); 6687 pgoff = pte_to_pgoff(ptent);
6693 6688
6694 /* page is moved even if it's not RSS of this task(page-faulted). */ 6689 /* page is moved even if it's not RSS of this task(page-faulted). */
6695 page = find_get_page(mapping, pgoff); 6690 page = find_get_page(mapping, pgoff);
6696 6691
6697 #ifdef CONFIG_SWAP 6692 #ifdef CONFIG_SWAP
6698 /* shmem/tmpfs may report page out on swap: account for that too. */ 6693 /* shmem/tmpfs may report page out on swap: account for that too. */
6699 if (radix_tree_exceptional_entry(page)) { 6694 if (radix_tree_exceptional_entry(page)) {
6700 swp_entry_t swap = radix_to_swp_entry(page); 6695 swp_entry_t swap = radix_to_swp_entry(page);
6701 if (do_swap_account) 6696 if (do_swap_account)
6702 *entry = swap; 6697 *entry = swap;
6703 page = find_get_page(swap_address_space(swap), swap.val); 6698 page = find_get_page(swap_address_space(swap), swap.val);
6704 } 6699 }
6705 #endif 6700 #endif
6706 return page; 6701 return page;
6707 } 6702 }
6708 6703
6709 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, 6704 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
6710 unsigned long addr, pte_t ptent, union mc_target *target) 6705 unsigned long addr, pte_t ptent, union mc_target *target)
6711 { 6706 {
6712 struct page *page = NULL; 6707 struct page *page = NULL;
6713 struct page_cgroup *pc; 6708 struct page_cgroup *pc;
6714 enum mc_target_type ret = MC_TARGET_NONE; 6709 enum mc_target_type ret = MC_TARGET_NONE;
6715 swp_entry_t ent = { .val = 0 }; 6710 swp_entry_t ent = { .val = 0 };
6716 6711
6717 if (pte_present(ptent)) 6712 if (pte_present(ptent))
6718 page = mc_handle_present_pte(vma, addr, ptent); 6713 page = mc_handle_present_pte(vma, addr, ptent);
6719 else if (is_swap_pte(ptent)) 6714 else if (is_swap_pte(ptent))
6720 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 6715 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
6721 else if (pte_none(ptent) || pte_file(ptent)) 6716 else if (pte_none(ptent) || pte_file(ptent))
6722 page = mc_handle_file_pte(vma, addr, ptent, &ent); 6717 page = mc_handle_file_pte(vma, addr, ptent, &ent);
6723 6718
6724 if (!page && !ent.val) 6719 if (!page && !ent.val)
6725 return ret; 6720 return ret;
6726 if (page) { 6721 if (page) {
6727 pc = lookup_page_cgroup(page); 6722 pc = lookup_page_cgroup(page);
6728 /* 6723 /*
6729 * Do only loose check w/o page_cgroup lock. 6724 * Do only loose check w/o page_cgroup lock.
6730 * mem_cgroup_move_account() checks the pc is valid or not under 6725 * mem_cgroup_move_account() checks the pc is valid or not under
6731 * the lock. 6726 * the lock.
6732 */ 6727 */
6733 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6728 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6734 ret = MC_TARGET_PAGE; 6729 ret = MC_TARGET_PAGE;
6735 if (target) 6730 if (target)
6736 target->page = page; 6731 target->page = page;
6737 } 6732 }
6738 if (!ret || !target) 6733 if (!ret || !target)
6739 put_page(page); 6734 put_page(page);
6740 } 6735 }
6741 /* There is a swap entry and a page doesn't exist or isn't charged */ 6736 /* There is a swap entry and a page doesn't exist or isn't charged */
6742 if (ent.val && !ret && 6737 if (ent.val && !ret &&
6743 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { 6738 mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
6744 ret = MC_TARGET_SWAP; 6739 ret = MC_TARGET_SWAP;
6745 if (target) 6740 if (target)
6746 target->ent = ent; 6741 target->ent = ent;
6747 } 6742 }
6748 return ret; 6743 return ret;
6749 } 6744 }
6750 6745
6751 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 6746 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
6752 /* 6747 /*
6753 * We don't consider swapping or file mapped pages because THP does not 6748 * We don't consider swapping or file mapped pages because THP does not
6754 * support them for now. 6749 * support them for now.
6755 * Caller should make sure that pmd_trans_huge(pmd) is true. 6750 * Caller should make sure that pmd_trans_huge(pmd) is true.
6756 */ 6751 */
6757 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6752 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6758 unsigned long addr, pmd_t pmd, union mc_target *target) 6753 unsigned long addr, pmd_t pmd, union mc_target *target)
6759 { 6754 {
6760 struct page *page = NULL; 6755 struct page *page = NULL;
6761 struct page_cgroup *pc; 6756 struct page_cgroup *pc;
6762 enum mc_target_type ret = MC_TARGET_NONE; 6757 enum mc_target_type ret = MC_TARGET_NONE;
6763 6758
6764 page = pmd_page(pmd); 6759 page = pmd_page(pmd);
6765 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 6760 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6766 if (!move_anon()) 6761 if (!move_anon())
6767 return ret; 6762 return ret;
6768 pc = lookup_page_cgroup(page); 6763 pc = lookup_page_cgroup(page);
6769 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 6764 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
6770 ret = MC_TARGET_PAGE; 6765 ret = MC_TARGET_PAGE;
6771 if (target) { 6766 if (target) {
6772 get_page(page); 6767 get_page(page);
6773 target->page = page; 6768 target->page = page;
6774 } 6769 }
6775 } 6770 }
6776 return ret; 6771 return ret;
6777 } 6772 }
6778 #else 6773 #else
6779 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, 6774 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6780 unsigned long addr, pmd_t pmd, union mc_target *target) 6775 unsigned long addr, pmd_t pmd, union mc_target *target)
6781 { 6776 {
6782 return MC_TARGET_NONE; 6777 return MC_TARGET_NONE;
6783 } 6778 }
6784 #endif 6779 #endif
6785 6780
6786 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 6781 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
6787 unsigned long addr, unsigned long end, 6782 unsigned long addr, unsigned long end,
6788 struct mm_walk *walk) 6783 struct mm_walk *walk)
6789 { 6784 {
6790 struct vm_area_struct *vma = walk->private; 6785 struct vm_area_struct *vma = walk->private;
6791 pte_t *pte; 6786 pte_t *pte;
6792 spinlock_t *ptl; 6787 spinlock_t *ptl;
6793 6788
6794 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6789 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6795 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) 6790 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
6796 mc.precharge += HPAGE_PMD_NR; 6791 mc.precharge += HPAGE_PMD_NR;
6797 spin_unlock(ptl); 6792 spin_unlock(ptl);
6798 return 0; 6793 return 0;
6799 } 6794 }
6800 6795
6801 if (pmd_trans_unstable(pmd)) 6796 if (pmd_trans_unstable(pmd))
6802 return 0; 6797 return 0;
6803 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 6798 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
6804 for (; addr != end; pte++, addr += PAGE_SIZE) 6799 for (; addr != end; pte++, addr += PAGE_SIZE)
6805 if (get_mctgt_type(vma, addr, *pte, NULL)) 6800 if (get_mctgt_type(vma, addr, *pte, NULL))
6806 mc.precharge++; /* increment precharge temporarily */ 6801 mc.precharge++; /* increment precharge temporarily */
6807 pte_unmap_unlock(pte - 1, ptl); 6802 pte_unmap_unlock(pte - 1, ptl);
6808 cond_resched(); 6803 cond_resched();
6809 6804
6810 return 0; 6805 return 0;
6811 } 6806 }
6812 6807
6813 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 6808 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
6814 { 6809 {
6815 unsigned long precharge; 6810 unsigned long precharge;
6816 struct vm_area_struct *vma; 6811 struct vm_area_struct *vma;
6817 6812
6818 down_read(&mm->mmap_sem); 6813 down_read(&mm->mmap_sem);
6819 for (vma = mm->mmap; vma; vma = vma->vm_next) { 6814 for (vma = mm->mmap; vma; vma = vma->vm_next) {
6820 struct mm_walk mem_cgroup_count_precharge_walk = { 6815 struct mm_walk mem_cgroup_count_precharge_walk = {
6821 .pmd_entry = mem_cgroup_count_precharge_pte_range, 6816 .pmd_entry = mem_cgroup_count_precharge_pte_range,
6822 .mm = mm, 6817 .mm = mm,
6823 .private = vma, 6818 .private = vma,
6824 }; 6819 };
6825 if (is_vm_hugetlb_page(vma)) 6820 if (is_vm_hugetlb_page(vma))
6826 continue; 6821 continue;
6827 walk_page_range(vma->vm_start, vma->vm_end, 6822 walk_page_range(vma->vm_start, vma->vm_end,
6828 &mem_cgroup_count_precharge_walk); 6823 &mem_cgroup_count_precharge_walk);
6829 } 6824 }
6830 up_read(&mm->mmap_sem); 6825 up_read(&mm->mmap_sem);
6831 6826
6832 precharge = mc.precharge; 6827 precharge = mc.precharge;
6833 mc.precharge = 0; 6828 mc.precharge = 0;
6834 6829
6835 return precharge; 6830 return precharge;
6836 } 6831 }
6837 6832
6838 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 6833 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
6839 { 6834 {
6840 unsigned long precharge = mem_cgroup_count_precharge(mm); 6835 unsigned long precharge = mem_cgroup_count_precharge(mm);
6841 6836
6842 VM_BUG_ON(mc.moving_task); 6837 VM_BUG_ON(mc.moving_task);
6843 mc.moving_task = current; 6838 mc.moving_task = current;
6844 return mem_cgroup_do_precharge(precharge); 6839 return mem_cgroup_do_precharge(precharge);
6845 } 6840 }
6846 6841
6847 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 6842 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
6848 static void __mem_cgroup_clear_mc(void) 6843 static void __mem_cgroup_clear_mc(void)
6849 { 6844 {
6850 struct mem_cgroup *from = mc.from; 6845 struct mem_cgroup *from = mc.from;
6851 struct mem_cgroup *to = mc.to; 6846 struct mem_cgroup *to = mc.to;
6852 int i; 6847 int i;
6853 6848
6854 /* we must uncharge all the leftover precharges from mc.to */ 6849 /* we must uncharge all the leftover precharges from mc.to */
6855 if (mc.precharge) { 6850 if (mc.precharge) {
6856 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 6851 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
6857 mc.precharge = 0; 6852 mc.precharge = 0;
6858 } 6853 }
6859 /* 6854 /*
6860 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 6855 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
6861 * we must uncharge here. 6856 * we must uncharge here.
6862 */ 6857 */
6863 if (mc.moved_charge) { 6858 if (mc.moved_charge) {
6864 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 6859 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
6865 mc.moved_charge = 0; 6860 mc.moved_charge = 0;
6866 } 6861 }
6867 /* we must fixup refcnts and charges */ 6862 /* we must fixup refcnts and charges */
6868 if (mc.moved_swap) { 6863 if (mc.moved_swap) {
6869 /* uncharge swap account from the old cgroup */ 6864 /* uncharge swap account from the old cgroup */
6870 if (!mem_cgroup_is_root(mc.from)) 6865 if (!mem_cgroup_is_root(mc.from))
6871 res_counter_uncharge(&mc.from->memsw, 6866 res_counter_uncharge(&mc.from->memsw,
6872 PAGE_SIZE * mc.moved_swap); 6867 PAGE_SIZE * mc.moved_swap);
6873 6868
6874 for (i = 0; i < mc.moved_swap; i++) 6869 for (i = 0; i < mc.moved_swap; i++)
6875 css_put(&mc.from->css); 6870 css_put(&mc.from->css);
6876 6871
6877 if (!mem_cgroup_is_root(mc.to)) { 6872 if (!mem_cgroup_is_root(mc.to)) {
6878 /* 6873 /*
6879 * we charged both to->res and to->memsw, so we should 6874 * we charged both to->res and to->memsw, so we should
6880 * uncharge to->res. 6875 * uncharge to->res.
6881 */ 6876 */
6882 res_counter_uncharge(&mc.to->res, 6877 res_counter_uncharge(&mc.to->res,
6883 PAGE_SIZE * mc.moved_swap); 6878 PAGE_SIZE * mc.moved_swap);
6884 } 6879 }
6885 /* we've already done css_get(mc.to) */ 6880 /* we've already done css_get(mc.to) */
6886 mc.moved_swap = 0; 6881 mc.moved_swap = 0;
6887 } 6882 }
6888 memcg_oom_recover(from); 6883 memcg_oom_recover(from);
6889 memcg_oom_recover(to); 6884 memcg_oom_recover(to);
6890 wake_up_all(&mc.waitq); 6885 wake_up_all(&mc.waitq);
6891 } 6886 }
6892 6887
6893 static void mem_cgroup_clear_mc(void) 6888 static void mem_cgroup_clear_mc(void)
6894 { 6889 {
6895 struct mem_cgroup *from = mc.from; 6890 struct mem_cgroup *from = mc.from;
6896 6891
6897 /* 6892 /*
6898 * we must clear moving_task before waking up waiters at the end of 6893 * we must clear moving_task before waking up waiters at the end of
6899 * task migration. 6894 * task migration.
6900 */ 6895 */
6901 mc.moving_task = NULL; 6896 mc.moving_task = NULL;
6902 __mem_cgroup_clear_mc(); 6897 __mem_cgroup_clear_mc();
6903 spin_lock(&mc.lock); 6898 spin_lock(&mc.lock);
6904 mc.from = NULL; 6899 mc.from = NULL;
6905 mc.to = NULL; 6900 mc.to = NULL;
6906 spin_unlock(&mc.lock); 6901 spin_unlock(&mc.lock);
6907 mem_cgroup_end_move(from); 6902 mem_cgroup_end_move(from);
6908 } 6903 }
6909 6904
6910 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 6905 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
6911 struct cgroup_taskset *tset) 6906 struct cgroup_taskset *tset)
6912 { 6907 {
6913 struct task_struct *p = cgroup_taskset_first(tset); 6908 struct task_struct *p = cgroup_taskset_first(tset);
6914 int ret = 0; 6909 int ret = 0;
6915 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6910 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6916 unsigned long move_charge_at_immigrate; 6911 unsigned long move_charge_at_immigrate;
6917 6912
6918 /* 6913 /*
6919 	 * We are now committed to this value whatever it is. Changes in this 6914 	 * We are now committed to this value whatever it is. Changes in this
6920 * tunable will only affect upcoming migrations, not the current one. 6915 * tunable will only affect upcoming migrations, not the current one.
6921 * So we need to save it, and keep it going. 6916 * So we need to save it, and keep it going.
6922 */ 6917 */
6923 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 6918 move_charge_at_immigrate = memcg->move_charge_at_immigrate;
6924 if (move_charge_at_immigrate) { 6919 if (move_charge_at_immigrate) {
6925 struct mm_struct *mm; 6920 struct mm_struct *mm;
6926 struct mem_cgroup *from = mem_cgroup_from_task(p); 6921 struct mem_cgroup *from = mem_cgroup_from_task(p);
6927 6922
6928 VM_BUG_ON(from == memcg); 6923 VM_BUG_ON(from == memcg);
6929 6924
6930 mm = get_task_mm(p); 6925 mm = get_task_mm(p);
6931 if (!mm) 6926 if (!mm)
6932 return 0; 6927 return 0;
6933 		/* We move charges only when we move an owner of the mm */ 6928 		/* We move charges only when we move an owner of the mm */
6934 if (mm->owner == p) { 6929 if (mm->owner == p) {
6935 VM_BUG_ON(mc.from); 6930 VM_BUG_ON(mc.from);
6936 VM_BUG_ON(mc.to); 6931 VM_BUG_ON(mc.to);
6937 VM_BUG_ON(mc.precharge); 6932 VM_BUG_ON(mc.precharge);
6938 VM_BUG_ON(mc.moved_charge); 6933 VM_BUG_ON(mc.moved_charge);
6939 VM_BUG_ON(mc.moved_swap); 6934 VM_BUG_ON(mc.moved_swap);
6940 mem_cgroup_start_move(from); 6935 mem_cgroup_start_move(from);
6941 spin_lock(&mc.lock); 6936 spin_lock(&mc.lock);
6942 mc.from = from; 6937 mc.from = from;
6943 mc.to = memcg; 6938 mc.to = memcg;
6944 mc.immigrate_flags = move_charge_at_immigrate; 6939 mc.immigrate_flags = move_charge_at_immigrate;
6945 spin_unlock(&mc.lock); 6940 spin_unlock(&mc.lock);
6946 /* We set mc.moving_task later */ 6941 /* We set mc.moving_task later */
6947 6942
6948 ret = mem_cgroup_precharge_mc(mm); 6943 ret = mem_cgroup_precharge_mc(mm);
6949 if (ret) 6944 if (ret)
6950 mem_cgroup_clear_mc(); 6945 mem_cgroup_clear_mc();
6951 } 6946 }
6952 mmput(mm); 6947 mmput(mm);
6953 } 6948 }
6954 return ret; 6949 return ret;
6955 } 6950 }
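
can_attach() only records intent: it saves move_charge_at_immigrate, points the global move-charge state at the source and destination groups, and precharges the destination for every movable pte found by the walk above; the actual transfer happens in the move path that follows, and mem_cgroup_clear_mc() above is the teardown, invoked on precharge failure or from cancel_attach() just below. A hedged sketch of that shared state, assembled from the fields this hunk touches (the real definition and its initializer sit earlier in memcontrol.c):

	static struct move_charge_struct {
		spinlock_t	   lock;		/* guards from/to */
		struct mem_cgroup  *from, *to;		/* source and target memcg */
		unsigned long	   immigrate_flags;	/* saved move_charge_at_immigrate */
		unsigned long	   precharge;		/* charges taken up front */
		unsigned long	   moved_charge;	/* pages actually moved */
		unsigned long	   moved_swap;		/* swap entries moved */
		struct task_struct *moving_task;	/* the task performing the move */
		wait_queue_head_t  waitq;		/* concurrent chargers wait here */
	} mc;

In practice (cgroup v1) this path is armed by writing a non-zero mask to memory.move_charge_at_immigrate and then migrating a task that owns its mm into the destination group, which is what triggers the can_attach()/attach callbacks.
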
6956 6951
6957 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 6952 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
6958 struct cgroup_taskset *tset) 6953 struct cgroup_taskset *tset)
6959 { 6954 {
6960 mem_cgroup_clear_mc(); 6955 mem_cgroup_clear_mc();
6961 } 6956 }
6962 6957
6963 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 6958 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
6964 unsigned long addr, unsigned long end, 6959 unsigned long addr, unsigned long end,
6965 struct mm_walk *walk) 6960 struct mm_walk *walk)
6966 { 6961 {
6967 int ret = 0; 6962 int ret = 0;
6968 struct vm_area_struct *vma = walk->private; 6963 struct vm_area_struct *vma = walk->private;
6969 pte_t *pte; 6964 pte_t *pte;
6970 spinlock_t *ptl; 6965 spinlock_t *ptl;
6971 enum mc_target_type target_type; 6966 enum mc_target_type target_type;
6972 union mc_target target; 6967 union mc_target target;
6973 struct page *page; 6968 struct page *page;
6974 struct page_cgroup *pc; 6969 struct page_cgroup *pc;
6975 6970
6976 /* 6971 /*
6977 * We don't take compound_lock() here but no race with splitting thp 6972 * We don't take compound_lock() here but no race with splitting thp
6978 * happens because: 6973 * happens because:
6979 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not 6974 * - if pmd_trans_huge_lock() returns 1, the relevant thp is not
6980 * under splitting, which means there's no concurrent thp split, 6975 * under splitting, which means there's no concurrent thp split,
6981 * - if another thread runs into split_huge_page() just after we 6976 * - if another thread runs into split_huge_page() just after we
6982 * entered this if-block, the thread must wait for page table lock 6977 * entered this if-block, the thread must wait for page table lock
6983 * to be unlocked in __split_huge_page_splitting(), where the main 6978 * to be unlocked in __split_huge_page_splitting(), where the main
6984 * part of thp split is not executed yet. 6979 * part of thp split is not executed yet.
6985 */ 6980 */
6986 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 6981 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
6987 if (mc.precharge < HPAGE_PMD_NR) { 6982 if (mc.precharge < HPAGE_PMD_NR) {
6988 spin_unlock(ptl); 6983 spin_unlock(ptl);
6989 return 0; 6984 return 0;
6990 } 6985 }
6991 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); 6986 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
6992 if (target_type == MC_TARGET_PAGE) { 6987 if (target_type == MC_TARGET_PAGE) {
6993 page = target.page; 6988 page = target.page;
6994 if (!isolate_lru_page(page)) { 6989 if (!isolate_lru_page(page)) {
6995 pc = lookup_page_cgroup(page); 6990 pc = lookup_page_cgroup(page);
6996 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 6991 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
6997 pc, mc.from, mc.to)) { 6992 pc, mc.from, mc.to)) {
6998 mc.precharge -= HPAGE_PMD_NR; 6993 mc.precharge -= HPAGE_PMD_NR;
6999 mc.moved_charge += HPAGE_PMD_NR; 6994 mc.moved_charge += HPAGE_PMD_NR;
7000 } 6995 }
7001 putback_lru_page(page); 6996 putback_lru_page(page);
7002 } 6997 }
7003 put_page(page); 6998 put_page(page);
7004 } 6999 }
7005 spin_unlock(ptl); 7000 spin_unlock(ptl);
7006 return 0; 7001 return 0;
7007 } 7002 }
7008 7003
7009 if (pmd_trans_unstable(pmd)) 7004 if (pmd_trans_unstable(pmd))
7010 return 0; 7005 return 0;
7011 retry: 7006 retry:
7012 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 7007 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
7013 for (; addr != end; addr += PAGE_SIZE) { 7008 for (; addr != end; addr += PAGE_SIZE) {
7014 pte_t ptent = *(pte++); 7009 pte_t ptent = *(pte++);
7015 swp_entry_t ent; 7010 swp_entry_t ent;
7016 7011
7017 if (!mc.precharge) 7012 if (!mc.precharge)
7018 break; 7013 break;
7019 7014
7020 switch (get_mctgt_type(vma, addr, ptent, &target)) { 7015 switch (get_mctgt_type(vma, addr, ptent, &target)) {
7021 case MC_TARGET_PAGE: 7016 case MC_TARGET_PAGE:
7022 page = target.page; 7017 page = target.page;
7023 if (isolate_lru_page(page)) 7018 if (isolate_lru_page(page))
7024 goto put; 7019 goto put;
7025 pc = lookup_page_cgroup(page); 7020 pc = lookup_page_cgroup(page);
7026 if (!mem_cgroup_move_account(page, 1, pc, 7021 if (!mem_cgroup_move_account(page, 1, pc,
7027 mc.from, mc.to)) { 7022 mc.from, mc.to)) {
7028 mc.precharge--; 7023 mc.precharge--;
7029 /* we uncharge from mc.from later. */ 7024 /* we uncharge from mc.from later. */
7030 mc.moved_charge++; 7025 mc.moved_charge++;
7031 } 7026 }
7032 putback_lru_page(page); 7027 putback_lru_page(page);
7033 put: /* get_mctgt_type() gets the page */ 7028 put: /* get_mctgt_type() gets the page */
7034 put_page(page); 7029 put_page(page);
7035 break; 7030 break;
7036 case MC_TARGET_SWAP: 7031 case MC_TARGET_SWAP:
7037 ent = target.ent; 7032 ent = target.ent;
7038 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { 7033 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
7039 mc.precharge--; 7034 mc.precharge--;
7040 /* we fixup refcnts and charges later. */ 7035 /* we fixup refcnts and charges later. */
7041 mc.moved_swap++; 7036 mc.moved_swap++;
7042 } 7037 }
7043 break; 7038 break;
7044 default: 7039 default:
7045 break; 7040 break;
7046 } 7041 }
7047 } 7042 }
7048 pte_unmap_unlock(pte - 1, ptl); 7043 pte_unmap_unlock(pte - 1, ptl);
7049 cond_resched(); 7044 cond_resched();
7050 7045
7051 if (addr != end) { 7046 if (addr != end) {
7052 /* 7047 /*
7053 * We have consumed all precharges we got in can_attach(). 7048 * We have consumed all precharges we got in can_attach().
7054 * We try to charge one by one, but don't do any additional 7049 * We try to charge one by one, but don't do any additional
7055 * charges to mc.to if we have failed to charge once in the 7050 * charges to mc.to if we have failed to charge once in the
7056 * attach() phase. 7051 * attach() phase.
7057 */ 7052 */
7058 ret = mem_cgroup_do_precharge(1); 7053 ret = mem_cgroup_do_precharge(1);
7059 if (!ret) 7054 if (!ret)
7060 goto retry; 7055 goto retry;
7061 } 7056 }
7062 7057
7063 return ret; 7058 return ret;
7064 } 7059 }
7065 7060
7066 static void mem_cgroup_move_charge(struct mm_struct *mm) 7061 static void mem_cgroup_move_charge(struct mm_struct *mm)
7067 { 7062 {
7068 struct vm_area_struct *vma; 7063 struct vm_area_struct *vma;
7069 7064
7070 lru_add_drain_all(); 7065 lru_add_drain_all();
7071 retry: 7066 retry:
7072 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 7067 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
7073 /* 7068 /*
7074 * Someone who is holding the mmap_sem might be waiting in 7069 * Someone who is holding the mmap_sem might be waiting in
7075 * waitq. So we cancel all extra charges, wake up all waiters, 7070 * waitq. So we cancel all extra charges, wake up all waiters,
7076 * and retry. Because we cancel precharges, we might not be able 7071 * and retry. Because we cancel precharges, we might not be able
7077 * to move enough charges, but moving charge is a best-effort 7072 * to move enough charges, but moving charge is a best-effort
7078 * feature anyway, so it wouldn't be a big problem. 7073 * feature anyway, so it wouldn't be a big problem.
7079 */ 7074 */
7080 __mem_cgroup_clear_mc(); 7075 __mem_cgroup_clear_mc();
7081 cond_resched(); 7076 cond_resched();
7082 goto retry; 7077 goto retry;
7083 } 7078 }
7084 for (vma = mm->mmap; vma; vma = vma->vm_next) { 7079 for (vma = mm->mmap; vma; vma = vma->vm_next) {
7085 int ret; 7080 int ret;
7086 struct mm_walk mem_cgroup_move_charge_walk = { 7081 struct mm_walk mem_cgroup_move_charge_walk = {
7087 .pmd_entry = mem_cgroup_move_charge_pte_range, 7082 .pmd_entry = mem_cgroup_move_charge_pte_range,
7088 .mm = mm, 7083 .mm = mm,
7089 .private = vma, 7084 .private = vma,
7090 }; 7085 };
7091 if (is_vm_hugetlb_page(vma)) 7086 if (is_vm_hugetlb_page(vma))
7092 continue; 7087 continue;
7093 ret = walk_page_range(vma->vm_start, vma->vm_end, 7088 ret = walk_page_range(vma->vm_start, vma->vm_end,
7094 &mem_cgroup_move_charge_walk); 7089 &mem_cgroup_move_charge_walk);
7095 if (ret) 7090 if (ret)
7096 /* 7091 /*
7097 * means we have consumed all precharges and failed in 7092 * means we have consumed all precharges and failed in
7098 * doing additional charge. Just abandon here. 7093 * doing additional charge. Just abandon here.
7099 */ 7094 */
7100 break; 7095 break;
7101 } 7096 }
7102 up_read(&mm->mmap_sem); 7097 up_read(&mm->mmap_sem);
7103 } 7098 }
7104 7099
7105 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7100 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7106 struct cgroup_taskset *tset) 7101 struct cgroup_taskset *tset)
7107 { 7102 {
7108 struct task_struct *p = cgroup_taskset_first(tset); 7103 struct task_struct *p = cgroup_taskset_first(tset);
7109 struct mm_struct *mm = get_task_mm(p); 7104 struct mm_struct *mm = get_task_mm(p);
7110 7105
7111 if (mm) { 7106 if (mm) {
7112 if (mc.to) 7107 if (mc.to)
7113 mem_cgroup_move_charge(mm); 7108 mem_cgroup_move_charge(mm);
7114 mmput(mm); 7109 mmput(mm);
7115 } 7110 }
7116 if (mc.to) 7111 if (mc.to)
7117 mem_cgroup_clear_mc(); 7112 mem_cgroup_clear_mc();
7118 } 7113 }
7119 #else /* !CONFIG_MMU */ 7114 #else /* !CONFIG_MMU */
7120 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, 7115 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
7121 struct cgroup_taskset *tset) 7116 struct cgroup_taskset *tset)
7122 { 7117 {
7123 return 0; 7118 return 0;
7124 } 7119 }
7125 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, 7120 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
7126 struct cgroup_taskset *tset) 7121 struct cgroup_taskset *tset)
7127 { 7122 {
7128 } 7123 }
7129 static void mem_cgroup_move_task(struct cgroup_subsys_state *css, 7124 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
7130 struct cgroup_taskset *tset) 7125 struct cgroup_taskset *tset)
7131 { 7126 {
7132 } 7127 }
7133 #endif 7128 #endif
7134 7129
7135 /* 7130 /*
7136 * Cgroup retains root cgroups across [un]mount cycles, making it necessary 7131 * Cgroup retains root cgroups across [un]mount cycles, making it necessary
7137 * to verify the sane_behavior flag on each mount attempt. 7132 * to verify the sane_behavior flag on each mount attempt.
7138 */ 7133 */
7139 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) 7134 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
7140 { 7135 {
7141 /* 7136 /*
7142 * use_hierarchy is forced with sane_behavior. cgroup core 7137 * use_hierarchy is forced with sane_behavior. cgroup core
7143 * guarantees that @root doesn't have any children, so turning it 7138 * guarantees that @root doesn't have any children, so turning it
7144 * on for the root memcg is enough. 7139 * on for the root memcg is enough.
7145 */ 7140 */
7146 if (cgroup_sane_behavior(root_css->cgroup)) 7141 if (cgroup_sane_behavior(root_css->cgroup))
7147 mem_cgroup_from_css(root_css)->use_hierarchy = true; 7142 mem_cgroup_from_css(root_css)->use_hierarchy = true;
7148 } 7143 }
7149 7144
7150 struct cgroup_subsys memory_cgrp_subsys = { 7145 struct cgroup_subsys memory_cgrp_subsys = {
7151 .css_alloc = mem_cgroup_css_alloc, 7146 .css_alloc = mem_cgroup_css_alloc,
7152 .css_online = mem_cgroup_css_online, 7147 .css_online = mem_cgroup_css_online,
7153 .css_offline = mem_cgroup_css_offline, 7148 .css_offline = mem_cgroup_css_offline,
7154 .css_free = mem_cgroup_css_free, 7149 .css_free = mem_cgroup_css_free,
7155 .can_attach = mem_cgroup_can_attach, 7150 .can_attach = mem_cgroup_can_attach,
7156 .cancel_attach = mem_cgroup_cancel_attach, 7151 .cancel_attach = mem_cgroup_cancel_attach,
7157 .attach = mem_cgroup_move_task, 7152 .attach = mem_cgroup_move_task,
7158 .bind = mem_cgroup_bind, 7153 .bind = mem_cgroup_bind,
7159 .base_cftypes = mem_cgroup_files, 7154 .base_cftypes = mem_cgroup_files,
7160 .early_init = 0, 7155 .early_init = 0,
7161 }; 7156 };
7162 7157
7163 #ifdef CONFIG_MEMCG_SWAP 7158 #ifdef CONFIG_MEMCG_SWAP
7164 static int __init enable_swap_account(char *s) 7159 static int __init enable_swap_account(char *s)
7165 { 7160 {
7166 if (!strcmp(s, "1")) 7161 if (!strcmp(s, "1"))
7167 really_do_swap_account = 1; 7162 really_do_swap_account = 1;
7168 else if (!strcmp(s, "0")) 7163 else if (!strcmp(s, "0"))
7169 really_do_swap_account = 0; 7164 really_do_swap_account = 0;
7170 return 1; 7165 return 1;
7171 } 7166 }
7172 __setup("swapaccount=", enable_swap_account); 7167 __setup("swapaccount=", enable_swap_account);
7173 7168
7174 static void __init memsw_file_init(void) 7169 static void __init memsw_file_init(void)
7175 { 7170 {
7176 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); 7171 WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
7177 } 7172 }
7178 7173
7179 static void __init enable_swap_cgroup(void) 7174 static void __init enable_swap_cgroup(void)
7180 { 7175 {
7181 if (!mem_cgroup_disabled() && really_do_swap_account) { 7176 if (!mem_cgroup_disabled() && really_do_swap_account) {
7182 do_swap_account = 1; 7177 do_swap_account = 1;
7183 memsw_file_init(); 7178 memsw_file_init();
7184 } 7179 }
7185 } 7180 }
7186 7181
7187 #else 7182 #else
7188 static void __init enable_swap_cgroup(void) 7183 static void __init enable_swap_cgroup(void)
7189 { 7184 {
7190 } 7185 }
7191 #endif 7186 #endif
7192 7187
7193 /* 7188 /*
7194 * subsys_initcall() for memory controller. 7189 * subsys_initcall() for memory controller.
7195 * 7190 *
7196 * Some parts like hotcpu_notifier() have to be initialized from this context 7191 * Some parts like hotcpu_notifier() have to be initialized from this context
7197 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically 7192 * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
7198 * everything that doesn't depend on a specific mem_cgroup structure should 7193 * everything that doesn't depend on a specific mem_cgroup structure should
7199 * be initialized from here. 7194 * be initialized from here.
7200 */ 7195 */
7201 static int __init mem_cgroup_init(void) 7196 static int __init mem_cgroup_init(void)
7202 { 7197 {
7203 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 7198 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
7204 enable_swap_cgroup(); 7199 enable_swap_cgroup();
7205 mem_cgroup_soft_limit_tree_init(); 7200 mem_cgroup_soft_limit_tree_init();
7206 memcg_stock_init(); 7201 memcg_stock_init();
7207 return 0; 7202 return 0;
7208 } 7203 }
7209 subsys_initcall(mem_cgroup_init); 7204 subsys_initcall(mem_cgroup_init);
7210 7205
mm/slab_common.c
1 /* 1 /*
2 * Slab allocator functions that are independent of the allocator strategy 2 * Slab allocator functions that are independent of the allocator strategy
3 * 3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com> 4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */ 5 */
6 #include <linux/slab.h> 6 #include <linux/slab.h>
7 7
8 #include <linux/mm.h> 8 #include <linux/mm.h>
9 #include <linux/poison.h> 9 #include <linux/poison.h>
10 #include <linux/interrupt.h> 10 #include <linux/interrupt.h>
11 #include <linux/memory.h> 11 #include <linux/memory.h>
12 #include <linux/compiler.h> 12 #include <linux/compiler.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/cpu.h> 14 #include <linux/cpu.h>
15 #include <linux/uaccess.h> 15 #include <linux/uaccess.h>
16 #include <linux/seq_file.h> 16 #include <linux/seq_file.h>
17 #include <linux/proc_fs.h> 17 #include <linux/proc_fs.h>
18 #include <asm/cacheflush.h> 18 #include <asm/cacheflush.h>
19 #include <asm/tlbflush.h> 19 #include <asm/tlbflush.h>
20 #include <asm/page.h> 20 #include <asm/page.h>
21 #include <linux/memcontrol.h> 21 #include <linux/memcontrol.h>
22 #include <trace/events/kmem.h> 22 #include <trace/events/kmem.h>
23 23
24 #include "slab.h" 24 #include "slab.h"
25 25
26 enum slab_state slab_state; 26 enum slab_state slab_state;
27 LIST_HEAD(slab_caches); 27 LIST_HEAD(slab_caches);
28 DEFINE_MUTEX(slab_mutex); 28 DEFINE_MUTEX(slab_mutex);
29 struct kmem_cache *kmem_cache; 29 struct kmem_cache *kmem_cache;
30 30
31 #ifdef CONFIG_DEBUG_VM 31 #ifdef CONFIG_DEBUG_VM
32 static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, 32 static int kmem_cache_sanity_check(const char *name, size_t size)
33 size_t size)
34 { 33 {
35 struct kmem_cache *s = NULL; 34 struct kmem_cache *s = NULL;
36 35
37 if (!name || in_interrupt() || size < sizeof(void *) || 36 if (!name || in_interrupt() || size < sizeof(void *) ||
38 size > KMALLOC_MAX_SIZE) { 37 size > KMALLOC_MAX_SIZE) {
39 pr_err("kmem_cache_create(%s) integrity check failed\n", name); 38 pr_err("kmem_cache_create(%s) integrity check failed\n", name);
40 return -EINVAL; 39 return -EINVAL;
41 } 40 }
42 41
43 list_for_each_entry(s, &slab_caches, list) { 42 list_for_each_entry(s, &slab_caches, list) {
44 char tmp; 43 char tmp;
45 int res; 44 int res;
46 45
47 /* 46 /*
48 * This happens when the module gets unloaded and doesn't 47 * This happens when the module gets unloaded and doesn't
49 * destroy its slab cache and no-one else reuses the vmalloc 48 * destroy its slab cache and no-one else reuses the vmalloc
50 * area of the module. Print a warning. 49 * area of the module. Print a warning.
51 */ 50 */
52 res = probe_kernel_address(s->name, tmp); 51 res = probe_kernel_address(s->name, tmp);
53 if (res) { 52 if (res) {
54 pr_err("Slab cache with size %d has lost its name\n", 53 pr_err("Slab cache with size %d has lost its name\n",
55 s->object_size); 54 s->object_size);
56 continue; 55 continue;
57 } 56 }
58 57
59 #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON) 58 #if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
60 /* 59 if (!strcmp(s->name, name)) {
61 * For simplicity, we won't check this in the list of memcg
62 * caches. We have control over memcg naming, and if there
63 * aren't duplicates in the global list, there won't be any
64 * duplicates in the memcg lists as well.
65 */
66 if (!memcg && !strcmp(s->name, name)) {
67 pr_err("%s (%s): Cache name already exists.\n", 60 pr_err("%s (%s): Cache name already exists.\n",
68 __func__, name); 61 __func__, name);
69 dump_stack(); 62 dump_stack();
70 s = NULL; 63 s = NULL;
71 return -EINVAL; 64 return -EINVAL;
72 } 65 }
73 #endif 66 #endif
74 } 67 }
75 68
76 WARN_ON(strchr(name, ' ')); /* It confuses parsers */ 69 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
77 return 0; 70 return 0;
78 } 71 }
79 #else 72 #else
80 static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, 73 static inline int kmem_cache_sanity_check(const char *name, size_t size)
81 const char *name, size_t size)
82 { 74 {
83 return 0; 75 return 0;
84 } 76 }
85 #endif 77 #endif
86 78
87 #ifdef CONFIG_MEMCG_KMEM 79 #ifdef CONFIG_MEMCG_KMEM
88 int memcg_update_all_caches(int num_memcgs) 80 int memcg_update_all_caches(int num_memcgs)
89 { 81 {
90 struct kmem_cache *s; 82 struct kmem_cache *s;
91 int ret = 0; 83 int ret = 0;
92 mutex_lock(&slab_mutex); 84 mutex_lock(&slab_mutex);
93 85
94 list_for_each_entry(s, &slab_caches, list) { 86 list_for_each_entry(s, &slab_caches, list) {
95 if (!is_root_cache(s)) 87 if (!is_root_cache(s))
96 continue; 88 continue;
97 89
98 ret = memcg_update_cache_size(s, num_memcgs); 90 ret = memcg_update_cache_size(s, num_memcgs);
99 /* 91 /*
100 * See comment in memcontrol.c, memcg_update_cache_size: 92 * See comment in memcontrol.c, memcg_update_cache_size:
101 * Instead of freeing the memory, we'll just leave the caches 93 * Instead of freeing the memory, we'll just leave the caches
102 * up to this point in an updated state. 94 * up to this point in an updated state.
103 */ 95 */
104 if (ret) 96 if (ret)
105 goto out; 97 goto out;
106 } 98 }
107 99
108 memcg_update_array_size(num_memcgs); 100 memcg_update_array_size(num_memcgs);
109 out: 101 out:
110 mutex_unlock(&slab_mutex); 102 mutex_unlock(&slab_mutex);
111 return ret; 103 return ret;
112 } 104 }
113 #endif 105 #endif
114 106
115 /* 107 /*
116 * Figure out what the alignment of the objects will be given a set of 108 * Figure out what the alignment of the objects will be given a set of
117 * flags, a user specified alignment and the size of the objects. 109 * flags, a user specified alignment and the size of the objects.
118 */ 110 */
119 unsigned long calculate_alignment(unsigned long flags, 111 unsigned long calculate_alignment(unsigned long flags,
120 unsigned long align, unsigned long size) 112 unsigned long align, unsigned long size)
121 { 113 {
122 /* 114 /*
123 * If the user wants hardware cache aligned objects then follow that 115 * If the user wants hardware cache aligned objects then follow that
124 * suggestion if the object is sufficiently large. 116 * suggestion if the object is sufficiently large.
125 * 117 *
126 * The hardware cache alignment cannot override the specified 118 * The hardware cache alignment cannot override the specified
127 * alignment though. If that is greater, then use it. 119 * alignment though. If that is greater, then use it.
128 */ 120 */
129 if (flags & SLAB_HWCACHE_ALIGN) { 121 if (flags & SLAB_HWCACHE_ALIGN) {
130 unsigned long ralign = cache_line_size(); 122 unsigned long ralign = cache_line_size();
131 while (size <= ralign / 2) 123 while (size <= ralign / 2)
132 ralign /= 2; 124 ralign /= 2;
133 align = max(align, ralign); 125 align = max(align, ralign);
134 } 126 }
135 127
136 if (align < ARCH_SLAB_MINALIGN) 128 if (align < ARCH_SLAB_MINALIGN)
137 align = ARCH_SLAB_MINALIGN; 129 align = ARCH_SLAB_MINALIGN;
138 130
139 return ALIGN(align, sizeof(void *)); 131 return ALIGN(align, sizeof(void *));
140 } 132 }
141 133
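To make the SLAB_HWCACHE_ALIGN branch above concrete, here is a minimal user-space sketch (illustrative only; hwcache_align() is a made-up mirror of the loop, not a kernel API), assuming a 64-byte cache line:

        #include <stdio.h>

        /* Mirrors the SLAB_HWCACHE_ALIGN branch of calculate_alignment() above. */
        static unsigned long hwcache_align(unsigned long align, unsigned long size,
                                           unsigned long cache_line)
        {
                unsigned long ralign = cache_line;

                while (size <= ralign / 2)      /* halve until the object fills more than half a line */
                        ralign /= 2;
                if (ralign > align)
                        align = ralign;
                /* round up to a pointer-size multiple, like ALIGN(align, sizeof(void *)) */
                return (align + sizeof(void *) - 1) & ~(sizeof(void *) - 1);
        }

        int main(void)
        {
                /* a 20-byte object on a 64-byte line: 64 -> 32, stop; alignment is 32 */
                printf("%lu\n", hwcache_align(0, 20, 64));
                return 0;
        }

The effect is that small objects get a fraction of the cache-line size as their alignment rather than being padded out to a full line.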
134 static struct kmem_cache *
135 do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
136 unsigned long flags, void (*ctor)(void *),
137 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
138 {
139 struct kmem_cache *s;
140 int err;
142 141
142 err = -ENOMEM;
143 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
144 if (!s)
145 goto out;
146
147 s->name = name;
148 s->object_size = object_size;
149 s->size = size;
150 s->align = align;
151 s->ctor = ctor;
152
153 err = memcg_alloc_cache_params(memcg, s, root_cache);
154 if (err)
155 goto out_free_cache;
156
157 err = __kmem_cache_create(s, flags);
158 if (err)
159 goto out_free_cache;
160
161 s->refcount = 1;
162 list_add(&s->list, &slab_caches);
163 memcg_register_cache(s);
164 out:
165 if (err)
166 return ERR_PTR(err);
167 return s;
168
169 out_free_cache:
170 memcg_free_cache_params(s);
171 kfree(s);
172 goto out;
173 }
174
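Both creation paths below funnel into this helper. A hedged sketch of the calling convention they rely on (create_named() is a hypothetical caller, and the real callers do this under slab_mutex with the memcg/root_cache arguments filled in where relevant):

        /*
         * Illustrative only: the ownership rule both callers follow.  The name
         * must be dynamically allocated; on success it is owned by the cache,
         * on failure (ERR_PTR) the caller still owns it and must free it.
         */
        static struct kmem_cache *create_named(const char *base, size_t size)
        {
                struct kmem_cache *s;
                char *name = kstrdup(base, GFP_KERNEL);

                if (!name)
                        return ERR_PTR(-ENOMEM);

                s = do_kmem_cache_create(name, size, size,
                                         calculate_alignment(0, 0, size),
                                         0, NULL, NULL, NULL);
                if (IS_ERR(s))
                        kfree(name);
                return s;
        }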
143 /* 175 /*
144 * kmem_cache_create - Create a cache. 176 * kmem_cache_create - Create a cache.
145 * @name: A string which is used in /proc/slabinfo to identify this cache. 177 * @name: A string which is used in /proc/slabinfo to identify this cache.
146 * @size: The size of objects to be created in this cache. 178 * @size: The size of objects to be created in this cache.
147 * @align: The required alignment for the objects. 179 * @align: The required alignment for the objects.
148 * @flags: SLAB flags 180 * @flags: SLAB flags
149 * @ctor: A constructor for the objects. 181 * @ctor: A constructor for the objects.
150 * 182 *
151 * Returns a ptr to the cache on success, NULL on failure. 183 * Returns a ptr to the cache on success, NULL on failure.
152 * Cannot be called within an interrupt, but can be interrupted. 184 * Cannot be called within an interrupt, but can be interrupted.
153 * The @ctor is run when new pages are allocated by the cache. 185 * The @ctor is run when new pages are allocated by the cache.
154 * 186 *
155 * The flags are 187 * The flags are
156 * 188 *
157 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 189 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
158 * to catch references to uninitialised memory. 190 * to catch references to uninitialised memory.
159 * 191 *
160 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 192 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
161 * for buffer overruns. 193 * for buffer overruns.
162 * 194 *
163 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 195 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
164 * cacheline. This can be beneficial if you're counting cycles as closely 196 * cacheline. This can be beneficial if you're counting cycles as closely
165 * as davem. 197 * as davem.
166 */ 198 */
167
168 struct kmem_cache * 199 struct kmem_cache *
169 kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, 200 kmem_cache_create(const char *name, size_t size, size_t align,
170 size_t align, unsigned long flags, void (*ctor)(void *), 201 unsigned long flags, void (*ctor)(void *))
171 struct kmem_cache *parent_cache)
172 { 202 {
173 struct kmem_cache *s = NULL; 203 struct kmem_cache *s;
204 char *cache_name;
174 int err; 205 int err;
175 206
176 get_online_cpus(); 207 get_online_cpus();
177 mutex_lock(&slab_mutex); 208 mutex_lock(&slab_mutex);
178 209
179 err = kmem_cache_sanity_check(memcg, name, size); 210 err = kmem_cache_sanity_check(name, size);
180 if (err) 211 if (err)
181 goto out_unlock; 212 goto out_unlock;
182 213
183 if (memcg) {
184 /*
185 * Since per-memcg caches are created asynchronously on first
186 * allocation (see memcg_kmem_get_cache()), several threads can
187 * try to create the same cache, but only one of them may
188 * succeed. Therefore if we get here and see the cache has
189 * already been created, we silently return NULL.
190 */
191 if (cache_from_memcg_idx(parent_cache, memcg_cache_id(memcg)))
192 goto out_unlock;
193 }
194
195 /* 214 /*
196 * Some allocators will constrain the set of valid flags to a subset 215 * Some allocators will constrain the set of valid flags to a subset
197 * of all flags. We expect them to define CACHE_CREATE_MASK in this 216 * of all flags. We expect them to define CACHE_CREATE_MASK in this
198 * case, and we'll just provide them with a sanitized version of the 217 * case, and we'll just provide them with a sanitized version of the
199 * passed flags. 218 * passed flags.
200 */ 219 */
201 flags &= CACHE_CREATE_MASK; 220 flags &= CACHE_CREATE_MASK;
202 221
203 if (!memcg) { 222 s = __kmem_cache_alias(name, size, align, flags, ctor);
204 s = __kmem_cache_alias(name, size, align, flags, ctor); 223 if (s)
205 if (s) 224 goto out_unlock;
206 goto out_unlock;
207 }
208 225
209 err = -ENOMEM; 226 cache_name = kstrdup(name, GFP_KERNEL);
210 s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL); 227 if (!cache_name) {
211 if (!s) 228 err = -ENOMEM;
212 goto out_unlock; 229 goto out_unlock;
230 }
213 231
214 s->object_size = s->size = size; 232 s = do_kmem_cache_create(cache_name, size, size,
215 s->align = calculate_alignment(flags, align, size); 233 calculate_alignment(flags, align, size),
216 s->ctor = ctor; 234 flags, ctor, NULL, NULL);
235 if (IS_ERR(s)) {
236 err = PTR_ERR(s);
237 kfree(cache_name);
238 }
217 239
218 if (memcg)
219 s->name = memcg_create_cache_name(memcg, parent_cache);
220 else
221 s->name = kstrdup(name, GFP_KERNEL);
222 if (!s->name)
223 goto out_free_cache;
224
225 err = memcg_alloc_cache_params(memcg, s, parent_cache);
226 if (err)
227 goto out_free_cache;
228
229 err = __kmem_cache_create(s, flags);
230 if (err)
231 goto out_free_cache;
232
233 s->refcount = 1;
234 list_add(&s->list, &slab_caches);
235 memcg_register_cache(s);
236
237 out_unlock: 240 out_unlock:
238 mutex_unlock(&slab_mutex); 241 mutex_unlock(&slab_mutex);
239 put_online_cpus(); 242 put_online_cpus();
240 243
241 if (err) { 244 if (err) {
242 /*
243 * There is no point in flooding logs with warnings or
244 * especially crashing the system if we fail to create a cache
245 * for a memcg. In this case we will be accounting the memcg
246 * allocation to the root cgroup until we succeed to create its
247 * own cache, but it isn't that critical.
248 */
249 if (!memcg)
250 return NULL;
251
252 if (flags & SLAB_PANIC) 245 if (flags & SLAB_PANIC)
253 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", 246 panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
254 name, err); 247 name, err);
255 else { 248 else {
256 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", 249 printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d",
257 name, err); 250 name, err);
258 dump_stack(); 251 dump_stack();
259 } 252 }
260 return NULL; 253 return NULL;
261 } 254 }
262 return s; 255 return s;
263
264 out_free_cache:
265 memcg_free_cache_params(s);
266 kfree(s->name);
267 kmem_cache_free(kmem_cache, s);
268 goto out_unlock;
269 } 256 }
257 EXPORT_SYMBOL(kmem_cache_create);
270 258
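For reference, a typical caller of the root-cache path exported above looks roughly like this (struct foo, foo_ctor() and foo_init() are made-up names, shown only as a sketch):

        #include <linux/init.h>
        #include <linux/list.h>
        #include <linux/slab.h>

        struct foo {
                int              id;
                struct list_head node;
        };

        static struct kmem_cache *foo_cachep;

        /* run for each object when the cache allocates new pages */
        static void foo_ctor(void *obj)
        {
                struct foo *f = obj;

                INIT_LIST_HEAD(&f->node);
        }

        static int __init foo_init(void)
        {
                foo_cachep = kmem_cache_create("foo", sizeof(struct foo), 0,
                                               SLAB_HWCACHE_ALIGN, foo_ctor);
                return foo_cachep ? 0 : -ENOMEM;
        }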
271 struct kmem_cache * 259 #ifdef CONFIG_MEMCG_KMEM
272 kmem_cache_create(const char *name, size_t size, size_t align, 260 /*
273 unsigned long flags, void (*ctor)(void *)) 261 * kmem_cache_create_memcg - Create a cache for a memory cgroup.
262 * @memcg: The memory cgroup the new cache is for.
263 * @root_cache: The parent of the new cache.
264 *
265 * This function attempts to create a kmem cache that will serve allocation
266 * requests going from @memcg to @root_cache. The new cache inherits properties
267 * from its parent.
268 */
269 void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache)
274 { 270 {
275 return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); 271 struct kmem_cache *s;
272 char *cache_name;
273
274 get_online_cpus();
275 mutex_lock(&slab_mutex);
276
277 /*
278 * Since per-memcg caches are created asynchronously on first
279 * allocation (see memcg_kmem_get_cache()), several threads can try to
280 * create the same cache, but only one of them may succeed.
281 */
282 if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg)))
283 goto out_unlock;
284
285 cache_name = memcg_create_cache_name(memcg, root_cache);
286 if (!cache_name)
287 goto out_unlock;
288
289 s = do_kmem_cache_create(cache_name, root_cache->object_size,
290 root_cache->size, root_cache->align,
291 root_cache->flags, root_cache->ctor,
292 memcg, root_cache);
293 if (IS_ERR(s)) {
294 kfree(cache_name);
295 goto out_unlock;
296 }
297
298 s->allocflags |= __GFP_KMEMCG;
299
300 out_unlock:
301 mutex_unlock(&slab_mutex);
302 put_online_cpus();
276 } 303 }
277 EXPORT_SYMBOL(kmem_cache_create); 304 #endif /* CONFIG_MEMCG_KMEM */
278 305
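The new memcg entry point returns nothing and is idempotent, so the asynchronous creation path mentioned in the comment above can simply call it and forget about the result. A minimal sketch of such a caller (memcg_create_cache_work() is a hypothetical name, not taken from this commit):

        /* Hypothetical worker scheduled by memcg_kmem_get_cache() on a cache miss. */
        static void memcg_create_cache_work(struct mem_cgroup *memcg,
                                            struct kmem_cache *root_cache)
        {
                /*
                 * Safe to race: if another thread created the cache first,
                 * kmem_cache_create_memcg() just bails out.  Failures are also
                 * tolerated; allocations keep being accounted to the root cache
                 * until a later attempt succeeds.
                 */
                kmem_cache_create_memcg(memcg, root_cache);
        }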
279 void kmem_cache_destroy(struct kmem_cache *s) 306 void kmem_cache_destroy(struct kmem_cache *s)
280 { 307 {
281 /* Destroy all the child caches if we aren't a memcg cache */ 308 /* Destroy all the child caches if we aren't a memcg cache */
282 kmem_cache_destroy_memcg_children(s); 309 kmem_cache_destroy_memcg_children(s);
283 310
284 get_online_cpus(); 311 get_online_cpus();
285 mutex_lock(&slab_mutex); 312 mutex_lock(&slab_mutex);
286 s->refcount--; 313 s->refcount--;
287 if (!s->refcount) { 314 if (!s->refcount) {
288 list_del(&s->list); 315 list_del(&s->list);
289 316
290 if (!__kmem_cache_shutdown(s)) { 317 if (!__kmem_cache_shutdown(s)) {
291 memcg_unregister_cache(s); 318 memcg_unregister_cache(s);
292 mutex_unlock(&slab_mutex); 319 mutex_unlock(&slab_mutex);
293 if (s->flags & SLAB_DESTROY_BY_RCU) 320 if (s->flags & SLAB_DESTROY_BY_RCU)
294 rcu_barrier(); 321 rcu_barrier();
295 322
296 memcg_free_cache_params(s); 323 memcg_free_cache_params(s);
297 kfree(s->name); 324 kfree(s->name);
298 kmem_cache_free(kmem_cache, s); 325 kmem_cache_free(kmem_cache, s);
299 } else { 326 } else {
300 list_add(&s->list, &slab_caches); 327 list_add(&s->list, &slab_caches);
301 mutex_unlock(&slab_mutex); 328 mutex_unlock(&slab_mutex);
302 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n", 329 printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
303 s->name); 330 s->name);
304 dump_stack(); 331 dump_stack();
305 } 332 }
306 } else { 333 } else {
307 mutex_unlock(&slab_mutex); 334 mutex_unlock(&slab_mutex);
308 } 335 }
309 put_online_cpus(); 336 put_online_cpus();
310 } 337 }
311 EXPORT_SYMBOL(kmem_cache_destroy); 338 EXPORT_SYMBOL(kmem_cache_destroy);
312 339
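Continuing the illustrative foo sketch from above, the matching teardown is a single call; as the code above shows, it also takes down any per-memcg child caches first:

        static void __exit foo_exit(void)
        {
                /* destroys per-memcg children via kmem_cache_destroy_memcg_children() */
                kmem_cache_destroy(foo_cachep);
        }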
313 int slab_is_available(void) 340 int slab_is_available(void)
314 { 341 {
315 return slab_state >= UP; 342 return slab_state >= UP;
316 } 343 }
317 344
318 #ifndef CONFIG_SLOB 345 #ifndef CONFIG_SLOB
319 /* Create a cache during boot when no slab services are available yet */ 346 /* Create a cache during boot when no slab services are available yet */
320 void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, 347 void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size,
321 unsigned long flags) 348 unsigned long flags)
322 { 349 {
323 int err; 350 int err;
324 351
325 s->name = name; 352 s->name = name;
326 s->size = s->object_size = size; 353 s->size = s->object_size = size;
327 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 354 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
328 err = __kmem_cache_create(s, flags); 355 err = __kmem_cache_create(s, flags);
329 356
330 if (err) 357 if (err)
331 panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", 358 panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n",
332 name, size, err); 359 name, size, err);
333 360
334 s->refcount = -1; /* Exempt from merging for now */ 361 s->refcount = -1; /* Exempt from merging for now */
335 } 362 }
336 363
337 struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, 364 struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size,
338 unsigned long flags) 365 unsigned long flags)
339 { 366 {
340 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); 367 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
341 368
342 if (!s) 369 if (!s)
343 panic("Out of memory when creating slab %s\n", name); 370 panic("Out of memory when creating slab %s\n", name);
344 371
345 create_boot_cache(s, name, size, flags); 372 create_boot_cache(s, name, size, flags);
346 list_add(&s->list, &slab_caches); 373 list_add(&s->list, &slab_caches);
347 s->refcount = 1; 374 s->refcount = 1;
348 return s; 375 return s;
349 } 376 }
350 377
351 struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; 378 struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
352 EXPORT_SYMBOL(kmalloc_caches); 379 EXPORT_SYMBOL(kmalloc_caches);
353 380
354 #ifdef CONFIG_ZONE_DMA 381 #ifdef CONFIG_ZONE_DMA
355 struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; 382 struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
356 EXPORT_SYMBOL(kmalloc_dma_caches); 383 EXPORT_SYMBOL(kmalloc_dma_caches);
357 #endif 384 #endif
358 385
359 /* 386 /*
360 * Conversion table for small slabs sizes / 8 to the index in the 387 * Conversion table for small slabs sizes / 8 to the index in the
361 * kmalloc array. This is necessary for slabs < 192 since we have non power 388 * kmalloc array. This is necessary for slabs < 192 since we have non power
362 * of two cache sizes there. The size of larger slabs can be determined using 389 * of two cache sizes there. The size of larger slabs can be determined using
363 * fls. 390 * fls.
364 */ 391 */
365 static s8 size_index[24] = { 392 static s8 size_index[24] = {
366 3, /* 8 */ 393 3, /* 8 */
367 4, /* 16 */ 394 4, /* 16 */
368 5, /* 24 */ 395 5, /* 24 */
369 5, /* 32 */ 396 5, /* 32 */
370 6, /* 40 */ 397 6, /* 40 */
371 6, /* 48 */ 398 6, /* 48 */
372 6, /* 56 */ 399 6, /* 56 */
373 6, /* 64 */ 400 6, /* 64 */
374 1, /* 72 */ 401 1, /* 72 */
375 1, /* 80 */ 402 1, /* 80 */
376 1, /* 88 */ 403 1, /* 88 */
377 1, /* 96 */ 404 1, /* 96 */
378 7, /* 104 */ 405 7, /* 104 */
379 7, /* 112 */ 406 7, /* 112 */
380 7, /* 120 */ 407 7, /* 120 */
381 7, /* 128 */ 408 7, /* 128 */
382 2, /* 136 */ 409 2, /* 136 */
383 2, /* 144 */ 410 2, /* 144 */
384 2, /* 152 */ 411 2, /* 152 */
385 2, /* 160 */ 412 2, /* 160 */
386 2, /* 168 */ 413 2, /* 168 */
387 2, /* 176 */ 414 2, /* 176 */
388 2, /* 184 */ 415 2, /* 184 */
389 2 /* 192 */ 416 2 /* 192 */
390 }; 417 };
391 418
392 static inline int size_index_elem(size_t bytes) 419 static inline int size_index_elem(size_t bytes)
393 { 420 {
394 return (bytes - 1) / 8; 421 return (bytes - 1) / 8;
395 } 422 }
396 423
397 /* 424 /*
398 * Find the kmem_cache structure that serves a given size of 425 * Find the kmem_cache structure that serves a given size of
399 * allocation 426 * allocation
400 */ 427 */
401 struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) 428 struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags)
402 { 429 {
403 int index; 430 int index;
404 431
405 if (unlikely(size > KMALLOC_MAX_SIZE)) { 432 if (unlikely(size > KMALLOC_MAX_SIZE)) {
406 WARN_ON_ONCE(!(flags & __GFP_NOWARN)); 433 WARN_ON_ONCE(!(flags & __GFP_NOWARN));
407 return NULL; 434 return NULL;
408 } 435 }
409 436
410 if (size <= 192) { 437 if (size <= 192) {
411 if (!size) 438 if (!size)
412 return ZERO_SIZE_PTR; 439 return ZERO_SIZE_PTR;
413 440
414 index = size_index[size_index_elem(size)]; 441 index = size_index[size_index_elem(size)];
415 } else 442 } else
416 index = fls(size - 1); 443 index = fls(size - 1);
417 444
418 #ifdef CONFIG_ZONE_DMA 445 #ifdef CONFIG_ZONE_DMA
419 if (unlikely((flags & GFP_DMA))) 446 if (unlikely((flags & GFP_DMA)))
420 return kmalloc_dma_caches[index]; 447 return kmalloc_dma_caches[index];
421 448
422 #endif 449 #endif
423 return kmalloc_caches[index]; 450 return kmalloc_caches[index];
424 } 451 }
425 452
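A worked example of the lookup above (illustrative): a 100-byte request gives size_index[(100 - 1) / 8] = size_index[12] = 7, i.e. the 128-byte kmalloc cache, while a 1000-byte request gives fls(999) = 10, i.e. the 1024-byte cache. A small user-space mirror of the index calculation (kmalloc_index_of() is a made-up helper):

        #include <stdio.h>

        /* Mirrors kmalloc_slab()'s index lookup; ignores the zero-size and DMA cases. */
        static int kmalloc_index_of(unsigned long size)
        {
                static const signed char size_index[24] = {
                        3, 4, 5, 5, 6, 6, 6, 6, 1, 1, 1, 1,
                        7, 7, 7, 7, 2, 2, 2, 2, 2, 2, 2, 2,
                };

                if (size <= 192)
                        return size_index[(size - 1) / 8];
                return 64 - __builtin_clzll(size - 1);  /* equivalent to fls(size - 1) here */
        }

        int main(void)
        {
                /* prints "7 10": the 128-byte and 1024-byte kmalloc caches */
                printf("%d %d\n", kmalloc_index_of(100), kmalloc_index_of(1000));
                return 0;
        }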
426 /* 453 /*
427 * Create the kmalloc array. Some of the regular kmalloc arrays 454 * Create the kmalloc array. Some of the regular kmalloc arrays
428 * may already have been created because they were needed to 455 * may already have been created because they were needed to
429 * enable allocations for slab creation. 456 * enable allocations for slab creation.
430 */ 457 */
431 void __init create_kmalloc_caches(unsigned long flags) 458 void __init create_kmalloc_caches(unsigned long flags)
432 { 459 {
433 int i; 460 int i;
434 461
435 /* 462 /*
436 * Patch up the size_index table if we have strange large alignment 463 * Patch up the size_index table if we have strange large alignment
437 * requirements for the kmalloc array. This is only the case for 464 * requirements for the kmalloc array. This is only the case for
438 * MIPS it seems. The standard arches will not generate any code here. 465 * MIPS it seems. The standard arches will not generate any code here.
439 * 466 *
440 * Largest permitted alignment is 256 bytes due to the way we 467 * Largest permitted alignment is 256 bytes due to the way we
441 * handle the index determination for the smaller caches. 468 * handle the index determination for the smaller caches.
442 * 469 *
443 * Make sure that nothing crazy happens if someone starts tinkering 470 * Make sure that nothing crazy happens if someone starts tinkering
444 * around with ARCH_KMALLOC_MINALIGN 471 * around with ARCH_KMALLOC_MINALIGN
445 */ 472 */
446 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || 473 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
447 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); 474 (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
448 475
449 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { 476 for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
450 int elem = size_index_elem(i); 477 int elem = size_index_elem(i);
451 478
452 if (elem >= ARRAY_SIZE(size_index)) 479 if (elem >= ARRAY_SIZE(size_index))
453 break; 480 break;
454 size_index[elem] = KMALLOC_SHIFT_LOW; 481 size_index[elem] = KMALLOC_SHIFT_LOW;
455 } 482 }
456 483
457 if (KMALLOC_MIN_SIZE >= 64) { 484 if (KMALLOC_MIN_SIZE >= 64) {
458 /* 485 /*
459 * The 96 byte size cache is not used if the alignment 486 * The 96 byte size cache is not used if the alignment
461 * is 64 bytes. 488 * is 64 bytes.
461 */ 488 */
462 for (i = 64 + 8; i <= 96; i += 8) 489 for (i = 64 + 8; i <= 96; i += 8)
463 size_index[size_index_elem(i)] = 7; 490 size_index[size_index_elem(i)] = 7;
464 491
465 } 492 }
466 493
467 if (KMALLOC_MIN_SIZE >= 128) { 494 if (KMALLOC_MIN_SIZE >= 128) {
468 /* 495 /*
469 * The 192 byte sized cache is not used if the alignment 496 * The 192 byte sized cache is not used if the alignment
470 * is 128 bytes. Redirect kmalloc to use the 256 byte cache 497 * is 128 bytes. Redirect kmalloc to use the 256 byte cache
471 * instead. 498 * instead.
472 */ 499 */
473 for (i = 128 + 8; i <= 192; i += 8) 500 for (i = 128 + 8; i <= 192; i += 8)
474 size_index[size_index_elem(i)] = 8; 501 size_index[size_index_elem(i)] = 8;
475 } 502 }
476 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 503 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
477 if (!kmalloc_caches[i]) { 504 if (!kmalloc_caches[i]) {
478 kmalloc_caches[i] = create_kmalloc_cache(NULL, 505 kmalloc_caches[i] = create_kmalloc_cache(NULL,
479 1 << i, flags); 506 1 << i, flags);
480 } 507 }
481 508
482 /* 509 /*
483 * Caches that are not of a power-of-two size. 510 * Caches that are not of a power-of-two size.
484 * These have to be created immediately after the 511 * These have to be created immediately after the
485 * earlier power-of-two caches 512 * earlier power-of-two caches
486 */ 513 */
487 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6) 514 if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
488 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); 515 kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags);
489 516
490 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7) 517 if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
491 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags); 518 kmalloc_caches[2] = create_kmalloc_cache(NULL, 192, flags);
492 } 519 }
493 520
494 /* Kmalloc array is now usable */ 521 /* Kmalloc array is now usable */
495 slab_state = UP; 522 slab_state = UP;
496 523
497 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 524 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
498 struct kmem_cache *s = kmalloc_caches[i]; 525 struct kmem_cache *s = kmalloc_caches[i];
499 char *n; 526 char *n;
500 527
501 if (s) { 528 if (s) {
502 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); 529 n = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i));
503 530
504 BUG_ON(!n); 531 BUG_ON(!n);
505 s->name = n; 532 s->name = n;
506 } 533 }
507 } 534 }
508 535
509 #ifdef CONFIG_ZONE_DMA 536 #ifdef CONFIG_ZONE_DMA
510 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { 537 for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
511 struct kmem_cache *s = kmalloc_caches[i]; 538 struct kmem_cache *s = kmalloc_caches[i];
512 539
513 if (s) { 540 if (s) {
514 int size = kmalloc_size(i); 541 int size = kmalloc_size(i);
515 char *n = kasprintf(GFP_NOWAIT, 542 char *n = kasprintf(GFP_NOWAIT,
516 "dma-kmalloc-%d", size); 543 "dma-kmalloc-%d", size);
517 544
518 BUG_ON(!n); 545 BUG_ON(!n);
519 kmalloc_dma_caches[i] = create_kmalloc_cache(n, 546 kmalloc_dma_caches[i] = create_kmalloc_cache(n,
520 size, SLAB_CACHE_DMA | flags); 547 size, SLAB_CACHE_DMA | flags);
521 } 548 }
522 } 549 }
523 #endif 550 #endif
524 } 551 }
525 #endif /* !CONFIG_SLOB */ 552 #endif /* !CONFIG_SLOB */
526 553
527 #ifdef CONFIG_TRACING 554 #ifdef CONFIG_TRACING
528 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) 555 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
529 { 556 {
530 void *ret = kmalloc_order(size, flags, order); 557 void *ret = kmalloc_order(size, flags, order);
531 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); 558 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
532 return ret; 559 return ret;
533 } 560 }
534 EXPORT_SYMBOL(kmalloc_order_trace); 561 EXPORT_SYMBOL(kmalloc_order_trace);
535 #endif 562 #endif
536 563
537 #ifdef CONFIG_SLABINFO 564 #ifdef CONFIG_SLABINFO
538 565
539 #ifdef CONFIG_SLAB 566 #ifdef CONFIG_SLAB
540 #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) 567 #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR)
541 #else 568 #else
542 #define SLABINFO_RIGHTS S_IRUSR 569 #define SLABINFO_RIGHTS S_IRUSR
543 #endif 570 #endif
544 571
545 void print_slabinfo_header(struct seq_file *m) 572 void print_slabinfo_header(struct seq_file *m)
546 { 573 {
547 /* 574 /*
548 * Output format version, so at least we can change it 575 * Output format version, so at least we can change it
549 * without _too_ many complaints. 576 * without _too_ many complaints.
550 */ 577 */
551 #ifdef CONFIG_DEBUG_SLAB 578 #ifdef CONFIG_DEBUG_SLAB
552 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 579 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
553 #else 580 #else
554 seq_puts(m, "slabinfo - version: 2.1\n"); 581 seq_puts(m, "slabinfo - version: 2.1\n");
555 #endif 582 #endif
556 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 583 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
557 "<objperslab> <pagesperslab>"); 584 "<objperslab> <pagesperslab>");
558 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 585 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
559 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 586 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
560 #ifdef CONFIG_DEBUG_SLAB 587 #ifdef CONFIG_DEBUG_SLAB
561 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " 588 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
562 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); 589 "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>");
563 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 590 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
564 #endif 591 #endif
565 seq_putc(m, '\n'); 592 seq_putc(m, '\n');
566 } 593 }
567 594
568 static void *s_start(struct seq_file *m, loff_t *pos) 595 static void *s_start(struct seq_file *m, loff_t *pos)
569 { 596 {
570 loff_t n = *pos; 597 loff_t n = *pos;
571 598
572 mutex_lock(&slab_mutex); 599 mutex_lock(&slab_mutex);
573 if (!n) 600 if (!n)
574 print_slabinfo_header(m); 601 print_slabinfo_header(m);
575 602
576 return seq_list_start(&slab_caches, *pos); 603 return seq_list_start(&slab_caches, *pos);
577 } 604 }
578 605
579 void *slab_next(struct seq_file *m, void *p, loff_t *pos) 606 void *slab_next(struct seq_file *m, void *p, loff_t *pos)
580 { 607 {
581 return seq_list_next(p, &slab_caches, pos); 608 return seq_list_next(p, &slab_caches, pos);
582 } 609 }
583 610
584 void slab_stop(struct seq_file *m, void *p) 611 void slab_stop(struct seq_file *m, void *p)
585 { 612 {
586 mutex_unlock(&slab_mutex); 613 mutex_unlock(&slab_mutex);
587 } 614 }
588 615
589 static void 616 static void
590 memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) 617 memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
591 { 618 {
592 struct kmem_cache *c; 619 struct kmem_cache *c;
593 struct slabinfo sinfo; 620 struct slabinfo sinfo;
594 int i; 621 int i;
595 622
596 if (!is_root_cache(s)) 623 if (!is_root_cache(s))
597 return; 624 return;
598 625
599 for_each_memcg_cache_index(i) { 626 for_each_memcg_cache_index(i) {
600 c = cache_from_memcg_idx(s, i); 627 c = cache_from_memcg_idx(s, i);
601 if (!c) 628 if (!c)
602 continue; 629 continue;
603 630
604 memset(&sinfo, 0, sizeof(sinfo)); 631 memset(&sinfo, 0, sizeof(sinfo));
605 get_slabinfo(c, &sinfo); 632 get_slabinfo(c, &sinfo);
606 633
607 info->active_slabs += sinfo.active_slabs; 634 info->active_slabs += sinfo.active_slabs;
608 info->num_slabs += sinfo.num_slabs; 635 info->num_slabs += sinfo.num_slabs;
609 info->shared_avail += sinfo.shared_avail; 636 info->shared_avail += sinfo.shared_avail;
610 info->active_objs += sinfo.active_objs; 637 info->active_objs += sinfo.active_objs;
611 info->num_objs += sinfo.num_objs; 638 info->num_objs += sinfo.num_objs;
612 } 639 }
613 } 640 }
614 641
615 int cache_show(struct kmem_cache *s, struct seq_file *m) 642 int cache_show(struct kmem_cache *s, struct seq_file *m)
616 { 643 {
617 struct slabinfo sinfo; 644 struct slabinfo sinfo;
618 645
619 memset(&sinfo, 0, sizeof(sinfo)); 646 memset(&sinfo, 0, sizeof(sinfo));
620 get_slabinfo(s, &sinfo); 647 get_slabinfo(s, &sinfo);
621 648
622 memcg_accumulate_slabinfo(s, &sinfo); 649 memcg_accumulate_slabinfo(s, &sinfo);
623 650
624 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 651 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
625 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, 652 cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
626 sinfo.objects_per_slab, (1 << sinfo.cache_order)); 653 sinfo.objects_per_slab, (1 << sinfo.cache_order));
627 654
628 seq_printf(m, " : tunables %4u %4u %4u", 655 seq_printf(m, " : tunables %4u %4u %4u",