Commit 8521fc50d433507a7cdc96bec280f9e5888a54cc

Authored by Michal Hocko
Committed by Linus Torvalds
1 parent 3e92041d68

memcg: get rid of percpu_charge_mutex lock

percpu_charge_mutex protects against multiple simultaneous drainings of
the per-cpu charge caches, because without it we might end up queuing too
many work items.  At least this was the case until commit 26fe61684449
("memcg: fix percpu cached charge draining frequency"), which introduced
more targeted draining for the async mode.

Now that the sync draining is targeted as well, we can safely remove the
mutex, because we will never queue more work items than the current
number of CPUs.  FLUSHING_CACHED_CHARGE protects against queuing the same
work item multiple times, and the stock->nr_pages == 0 check protects
against pointlessly queuing work when there is obviously nothing to be
done.  This is of course racy, but we can live with it because the race
window is really small (we would have to see FLUSHING_CACHED_CHARGE
cleared while nr_pages is still non-zero).

The only remaining place where we can race is the synchronous mode, where
we rely on the FLUSHING_CACHED_CHARGE test; the bit might have been set by
another drainer on the same group, but we should wait in that case as
well.
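
As an illustration of the resulting scheme, here is a minimal sketch of a
sync-capable drain loop without the mutex.  It is not the exact code from
this patch: the memcg hierarchy walk is elided, and the names used here
(memcg_stock, struct memcg_stock_pcp, drain_local_stock) simply follow the
conventions of the surrounding memcontrol.c code.

	static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
	{
		int cpu, curcpu;

		get_online_cpus();
		curcpu = get_cpu();
		for_each_online_cpu(cpu) {
			struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

			/*
			 * Nothing cached for root_mem on this cpu, so there is
			 * nothing to drain.  (The real code also walks the
			 * hierarchy instead of this exact-match check.)
			 */
			if (stock->cached != root_mem || !stock->nr_pages)
				continue;
			/*
			 * FLUSHING_CACHED_CHARGE guarantees that a given work
			 * item is queued at most once, so at most
			 * num_online_cpus() items can ever be in flight.
			 */
			if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
				if (cpu == curcpu)
					drain_local_stock(&stock->work);
				else
					schedule_work_on(cpu, &stock->work);
			}
		}
		put_cpu();

		if (sync) {
			/*
			 * Wait for every pending drainer, including one racing
			 * on the same group that set the flag before we did.
			 */
			for_each_online_cpu(cpu) {
				struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

				if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
					flush_work(&stock->work);
			}
		}
		put_online_cpus();
	}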

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 2 additions and 10 deletions

1	/* memcontrol.c - Memory Controller
2	 *
3	 * Copyright IBM Corporation, 2007
4	 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5	 *
6	 * Copyright 2007 OpenVZ SWsoft Inc
7	 * Author: Pavel Emelianov <xemul@openvz.org>
8	 *
9	 * Memory thresholds
10	 * Copyright (C) 2009 Nokia Corporation
11	 * Author: Kirill A. Shutemov
12	 *
13	 * This program is free software; you can redistribute it and/or modify
14	 * it under the terms of the GNU General Public License as published by
15	 * the Free Software Foundation; either version 2 of the License, or
16	 * (at your option) any later version.
17	 *
18	 * This program is distributed in the hope that it will be useful,
19	 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20	 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21	 * GNU General Public License for more details.
22	 */
23
24	#include <linux/res_counter.h>
25	#include <linux/memcontrol.h>
26	#include <linux/cgroup.h>
27	#include <linux/mm.h>
28	#include <linux/hugetlb.h>
29	#include <linux/pagemap.h>
30	#include <linux/smp.h>
31	#include <linux/page-flags.h>
32	#include <linux/backing-dev.h>
33	#include <linux/bit_spinlock.h>
34	#include <linux/rcupdate.h>
35	#include <linux/limits.h>
36	#include <linux/mutex.h>
37	#include <linux/rbtree.h>
38	#include <linux/shmem_fs.h>
39	#include <linux/slab.h>
40	#include <linux/swap.h>
41	#include <linux/swapops.h>
42	#include <linux/spinlock.h>
43	#include <linux/eventfd.h>
44	#include <linux/sort.h>
45	#include <linux/fs.h>
46	#include <linux/seq_file.h>
47	#include <linux/vmalloc.h>
48	#include <linux/mm_inline.h>
49	#include <linux/page_cgroup.h>
50	#include <linux/cpu.h>
51	#include <linux/oom.h>
52	#include "internal.h"
53
54	#include <asm/uaccess.h>
55
56	#include <trace/events/vmscan.h>
57
58	struct cgroup_subsys mem_cgroup_subsys __read_mostly;
59	#define MEM_CGROUP_RECLAIM_RETRIES 5
60	struct mem_cgroup *root_mem_cgroup __read_mostly;
61
62	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
63	/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
64	int do_swap_account __read_mostly;
65
66	/* for remember boot option*/
67	#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
68	static int really_do_swap_account __initdata = 1;
69	#else
70	static int really_do_swap_account __initdata = 0;
71	#endif
72
73	#else
74	#define do_swap_account (0)
75	#endif
76
77
78	/*
79	 * Statistics for memory cgroup.
80	 */
81	enum mem_cgroup_stat_index {
82		/*
83		 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
84		 */
85		MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
86		MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
87		MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
88		MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
89		MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
90		MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
91		MEM_CGROUP_STAT_NSTATS,
92	};
93
94	enum mem_cgroup_events_index {
95		MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
96		MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
97		MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
98		MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
99		MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
100		MEM_CGROUP_EVENTS_NSTATS,
101	};
102	/*
103	 * Per memcg event counter is incremented at every pagein/pageout. With THP,
104	 * it will be incremated by the number of pages. This counter is used for
105	 * for trigger some periodic events. This is straightforward and better
106	 * than using jiffies etc. to handle periodic memcg event.
107	 */
108	enum mem_cgroup_events_target {
109		MEM_CGROUP_TARGET_THRESH,
110		MEM_CGROUP_TARGET_SOFTLIMIT,
111		MEM_CGROUP_TARGET_NUMAINFO,
112		MEM_CGROUP_NTARGETS,
113	};
114	#define THRESHOLDS_EVENTS_TARGET (128)
115	#define SOFTLIMIT_EVENTS_TARGET (1024)
116	#define NUMAINFO_EVENTS_TARGET (1024)
117
118	struct mem_cgroup_stat_cpu {
119		long count[MEM_CGROUP_STAT_NSTATS];
120		unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
121		unsigned long targets[MEM_CGROUP_NTARGETS];
122	};
123
124	/*
125	 * per-zone information in memory controller.
126	 */
127	struct mem_cgroup_per_zone {
128		/*
129		 * spin_lock to protect the per cgroup LRU
130		 */
131		struct list_head lists[NR_LRU_LISTS];
132		unsigned long count[NR_LRU_LISTS];
133
134		struct zone_reclaim_stat reclaim_stat;
135		struct rb_node tree_node;	/* RB tree node */
136		unsigned long long usage_in_excess;/* Set to the value by which */
137						/* the soft limit is exceeded*/
138		bool on_tree;
139		struct mem_cgroup *mem;		/* Back pointer, we cannot */
140						/* use container_of */
141	};
142	/* Macro for accessing counter */
143	#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
144
145	struct mem_cgroup_per_node {
146		struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
147	};
148
149	struct mem_cgroup_lru_info {
150		struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
151	};
152
153	/*
154	 * Cgroups above their limits are maintained in a RB-Tree, independent of
155	 * their hierarchy representation
156	 */
157
158	struct mem_cgroup_tree_per_zone {
159		struct rb_root rb_root;
160		spinlock_t lock;
161	};
162
163	struct mem_cgroup_tree_per_node {
164		struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
165	};
166
167	struct mem_cgroup_tree {
168		struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
169	};
170
171	static struct mem_cgroup_tree soft_limit_tree __read_mostly;
172
173	struct mem_cgroup_threshold {
174		struct eventfd_ctx *eventfd;
175		u64 threshold;
176	};
177
178	/* For threshold */
179	struct mem_cgroup_threshold_ary {
180		/* An array index points to threshold just below usage. */
181		int current_threshold;
182		/* Size of entries[] */
183		unsigned int size;
184		/* Array of thresholds */
185		struct mem_cgroup_threshold entries[0];
186	};
187
188	struct mem_cgroup_thresholds {
189		/* Primary thresholds array */
190		struct mem_cgroup_threshold_ary *primary;
191		/*
192		 * Spare threshold array.
193		 * This is needed to make mem_cgroup_unregister_event() "never fail".
194		 * It must be able to store at least primary->size - 1 entries.
195		 */
196		struct mem_cgroup_threshold_ary *spare;
197	};
198
199	/* for OOM */
200	struct mem_cgroup_eventfd_list {
201		struct list_head list;
202		struct eventfd_ctx *eventfd;
203	};
204
205	static void mem_cgroup_threshold(struct mem_cgroup *mem);
206	static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
207
208	enum {
209		SCAN_BY_LIMIT,
210		SCAN_BY_SYSTEM,
211		NR_SCAN_CONTEXT,
212		SCAN_BY_SHRINK, /* not recorded now */
213	};
214
215	enum {
216		SCAN,
217		SCAN_ANON,
218		SCAN_FILE,
219		ROTATE,
220		ROTATE_ANON,
221		ROTATE_FILE,
222		FREED,
223		FREED_ANON,
224		FREED_FILE,
225		ELAPSED,
226		NR_SCANSTATS,
227	};
228
229	struct scanstat {
230		spinlock_t lock;
231		unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
232		unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
233	};
234
235	const char *scanstat_string[NR_SCANSTATS] = {
236		"scanned_pages",
237		"scanned_anon_pages",
238		"scanned_file_pages",
239		"rotated_pages",
240		"rotated_anon_pages",
241		"rotated_file_pages",
242		"freed_pages",
243		"freed_anon_pages",
244		"freed_file_pages",
245		"elapsed_ns",
246	};
247	#define SCANSTAT_WORD_LIMIT "_by_limit"
248	#define SCANSTAT_WORD_SYSTEM "_by_system"
249	#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251
252	/*
253	 * The memory controller data structure. The memory controller controls both
254	 * page cache and RSS per cgroup. We would eventually like to provide
255	 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
256	 * to help the administrator determine what knobs to tune.
257	 *
258	 * TODO: Add a water mark for the memory controller. Reclaim will begin when
259	 * we hit the water mark. May be even add a low water mark, such that
260	 * no reclaim occurs from a cgroup at it's low water mark, this is
261	 * a feature that will be implemented much later in the future.
262	 */
263	struct mem_cgroup {
264		struct cgroup_subsys_state css;
265		/*
266		 * the counter to account for memory usage
267		 */
268		struct res_counter res;
269		/*
270		 * the counter to account for mem+swap usage.
271		 */
272		struct res_counter memsw;
273		/*
274		 * Per cgroup active and inactive list, similar to the
275		 * per zone LRU lists.
276		 */
277		struct mem_cgroup_lru_info info;
278		/*
279		 * While reclaiming in a hierarchy, we cache the last child we
280		 * reclaimed from.
281		 */
282		int last_scanned_child;
283		int last_scanned_node;
284	#if MAX_NUMNODES > 1
285		nodemask_t scan_nodes;
286		atomic_t numainfo_events;
287		atomic_t numainfo_updating;
288	#endif
289		/*
290		 * Should the accounting and control be hierarchical, per subtree?
291		 */
292		bool use_hierarchy;
293
294		bool oom_lock;
295		atomic_t under_oom;
296
297		atomic_t refcnt;
298
299		int swappiness;
300		/* OOM-Killer disable */
301		int oom_kill_disable;
302
303		/* set when res.limit == memsw.limit */
304		bool memsw_is_minimum;
305
306		/* protect arrays of thresholds */
307		struct mutex thresholds_lock;
308
309		/* thresholds for memory usage. RCU-protected */
310		struct mem_cgroup_thresholds thresholds;
311
312		/* thresholds for mem+swap usage. RCU-protected */
313		struct mem_cgroup_thresholds memsw_thresholds;
314
315		/* For oom notifier event fd */
316		struct list_head oom_notify;
317		/* For recording LRU-scan statistics */
318		struct scanstat scanstat;
319		/*
320		 * Should we move charges of a task when a task is moved into this
321		 * mem_cgroup ? And what type of charges should we move ?
322		 */
323		unsigned long move_charge_at_immigrate;
324		/*
325		 * percpu counter.
326		 */
327		struct mem_cgroup_stat_cpu *stat;
328		/*
329		 * used when a cpu is offlined or other synchronizations
330		 * See mem_cgroup_read_stat().
331		 */
332		struct mem_cgroup_stat_cpu nocpu_base;
333		spinlock_t pcp_counter_lock;
334	};
335
336	/* Stuffs for move charges at task migration. */
337	/*
338	 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
339	 * left-shifted bitmap of these types.
340	 */
341	enum move_type {
342		MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
343		MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
344		NR_MOVE_TYPE,
345	};
346
347	/* "mc" and its members are protected by cgroup_mutex */
348	static struct move_charge_struct {
349		spinlock_t lock; /* for from, to */
350		struct mem_cgroup *from;
351		struct mem_cgroup *to;
352		unsigned long precharge;
353		unsigned long moved_charge;
354		unsigned long moved_swap;
355		struct task_struct *moving_task; /* a task moving charges */
356		wait_queue_head_t waitq; /* a waitq for other context */
357	} mc = {
358		.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
359		.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
360	};
361
362	static bool move_anon(void)
363	{
364		return test_bit(MOVE_CHARGE_TYPE_ANON,
365				&mc.to->move_charge_at_immigrate);
366	}
367
368	static bool move_file(void)
369	{
370		return test_bit(MOVE_CHARGE_TYPE_FILE,
371				&mc.to->move_charge_at_immigrate);
372	}
373
374	/*
375	 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
376	 * limit reclaim to prevent infinite loops, if they ever occur.
377	 */
378	#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
379	#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
380
381	enum charge_type {
382		MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
383		MEM_CGROUP_CHARGE_TYPE_MAPPED,
384		MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
385		MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
386		MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
387		MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
388		NR_CHARGE_TYPE,
389	};
390
391	/* for encoding cft->private value on file */
392	#define _MEM (0)
393	#define _MEMSWAP (1)
394	#define _OOM_TYPE (2)
395	#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
396	#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
397	#define MEMFILE_ATTR(val) ((val) & 0xffff)
398	/* Used for OOM nofiier */
399	#define OOM_CONTROL (0)
400
401	/*
402	 * Reclaim flags for mem_cgroup_hierarchical_reclaim
403	 */
404	#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
405	#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
406	#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
407	#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
408	#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
409	#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
410
411	static void mem_cgroup_get(struct mem_cgroup *mem);
412	static void mem_cgroup_put(struct mem_cgroup *mem);
413	static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
414	static void drain_all_stock_async(struct mem_cgroup *mem);
415
416	static struct mem_cgroup_per_zone *
417	mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
418	{
419		return &mem->info.nodeinfo[nid]->zoneinfo[zid];
420	}
421
422	struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
423	{
424		return &mem->css;
425	}
426
427	static struct mem_cgroup_per_zone *
428	page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
429	{
430		int nid = page_to_nid(page);
431		int zid = page_zonenum(page);
432
433		return mem_cgroup_zoneinfo(mem, nid, zid);
434	}
435
436	static struct mem_cgroup_tree_per_zone *
437	soft_limit_tree_node_zone(int nid, int zid)
438	{
439		return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
440	}
441
442	static struct mem_cgroup_tree_per_zone *
443	soft_limit_tree_from_page(struct page *page)
444	{
445		int nid = page_to_nid(page);
446		int zid = page_zonenum(page);
447
448		return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
449	}
450
451	static void
452	__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
453					struct mem_cgroup_per_zone *mz,
454					struct mem_cgroup_tree_per_zone *mctz,
455					unsigned long long new_usage_in_excess)
456	{
457		struct rb_node **p = &mctz->rb_root.rb_node;
458		struct rb_node *parent = NULL;
459		struct mem_cgroup_per_zone *mz_node;
460
461		if (mz->on_tree)
462			return;
463
464		mz->usage_in_excess = new_usage_in_excess;
465		if (!mz->usage_in_excess)
466			return;
467		while (*p) {
468			parent = *p;
469			mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
470						tree_node);
471			if (mz->usage_in_excess < mz_node->usage_in_excess)
472				p = &(*p)->rb_left;
473			/*
474			 * We can't avoid mem cgroups that are over their soft
75			 * limit by the same amount
476			 */
477			else if (mz->usage_in_excess >= mz_node->usage_in_excess)
478				p = &(*p)->rb_right;
479		}
480		rb_link_node(&mz->tree_node, parent, p);
481		rb_insert_color(&mz->tree_node, &mctz->rb_root);
482		mz->on_tree = true;
483	}
484
485	static void
486	__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
487					struct mem_cgroup_per_zone *mz,
488					struct mem_cgroup_tree_per_zone *mctz)
489	{
490		if (!mz->on_tree)
491			return;
492		rb_erase(&mz->tree_node, &mctz->rb_root);
493		mz->on_tree = false;
494	}
495
496	static void
497	mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
498					struct mem_cgroup_per_zone *mz,
499					struct mem_cgroup_tree_per_zone *mctz)
500	{
501		spin_lock(&mctz->lock);
502		__mem_cgroup_remove_exceeded(mem, mz, mctz);
503		spin_unlock(&mctz->lock);
504	}
505
506
507	static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
508	{
509		unsigned long long excess;
510		struct mem_cgroup_per_zone *mz;
511		struct mem_cgroup_tree_per_zone *mctz;
512		int nid = page_to_nid(page);
513		int zid = page_zonenum(page);
514		mctz = soft_limit_tree_from_page(page);
515
516		/*
517		 * Necessary to update all ancestors when hierarchy is used.
518		 * because their event counter is not touched.
519		 */
520		for (; mem; mem = parent_mem_cgroup(mem)) {
521			mz = mem_cgroup_zoneinfo(mem, nid, zid);
522			excess = res_counter_soft_limit_excess(&mem->res);
523			/*
524			 * We have to update the tree if mz is on RB-tree or
525			 * mem is over its softlimit.
526			 */
527			if (excess || mz->on_tree) {
528				spin_lock(&mctz->lock);
529				/* if on-tree, remove it */
530				if (mz->on_tree)
531					__mem_cgroup_remove_exceeded(mem, mz, mctz);
532				/*
533				 * Insert again. mz->usage_in_excess will be updated.
534				 * If excess is 0, no tree ops.
535				 */
536				__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
537				spin_unlock(&mctz->lock);
538			}
539		}
540	}
541
542	static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
543	{
544		int node, zone;
545		struct mem_cgroup_per_zone *mz;
546		struct mem_cgroup_tree_per_zone *mctz;
547
548		for_each_node_state(node, N_POSSIBLE) {
549			for (zone = 0; zone < MAX_NR_ZONES; zone++) {
550				mz = mem_cgroup_zoneinfo(mem, node, zone);
551				mctz = soft_limit_tree_node_zone(node, zone);
552				mem_cgroup_remove_exceeded(mem, mz, mctz);
553			}
554		}
555	}
556
557	static struct mem_cgroup_per_zone *
558	__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
559	{
560		struct rb_node *rightmost = NULL;
561		struct mem_cgroup_per_zone *mz;
562
563	retry:
564		mz = NULL;
565		rightmost = rb_last(&mctz->rb_root);
566		if (!rightmost)
567			goto done; /* Nothing to reclaim from */
568
569		mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
570		/*
571		 * Remove the node now but someone else can add it back,
572		 * we will to add it back at the end of reclaim to its correct
573		 * position in the tree.
574		 */
575		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
576		if (!res_counter_soft_limit_excess(&mz->mem->res) ||
577			!css_tryget(&mz->mem->css))
578			goto retry;
579	done:
580		return mz;
581	}
582
583	static struct mem_cgroup_per_zone *
584	mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
585	{
586		struct mem_cgroup_per_zone *mz;
587
588		spin_lock(&mctz->lock);
589		mz = __mem_cgroup_largest_soft_limit_node(mctz);
590		spin_unlock(&mctz->lock);
591		return mz;
592	}
593
594	/*
595	 * Implementation Note: reading percpu statistics for memcg.
596	 *
597	 * Both of vmstat[] and percpu_counter has threshold and do periodic
598	 * synchronization to implement "quick" read. There are trade-off between
599	 * reading cost and precision of value. Then, we may have a chance to implement
600	 * a periodic synchronizion of counter in memcg's counter.
601	 *
602	 * But this _read() function is used for user interface now. The user accounts
603	 * memory usage by memory cgroup and he _always_ requires exact value because
604	 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
605	 * have to visit all online cpus and make sum. So, for now, unnecessary
606	 * synchronization is not implemented. (just implemented for cpu hotplug)
607	 *
608	 * If there are kernel internal actions which can make use of some not-exact
609	 * value, and reading all cpu value can be performance bottleneck in some
610	 * common workload, threashold and synchonization as vmstat[] should be
611	 * implemented.
612	 */
613	static long mem_cgroup_read_stat(struct mem_cgroup *mem,
614					 enum mem_cgroup_stat_index idx)
615	{
616		long val = 0;
617		int cpu;
618
619		get_online_cpus();
620		for_each_online_cpu(cpu)
621			val += per_cpu(mem->stat->count[idx], cpu);
622	#ifdef CONFIG_HOTPLUG_CPU
623		spin_lock(&mem->pcp_counter_lock);
624		val += mem->nocpu_base.count[idx];
625		spin_unlock(&mem->pcp_counter_lock);
626	#endif
627		put_online_cpus();
628		return val;
629	}
630
631	static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
632						 bool charge)
633	{
634		int val = (charge) ? 1 : -1;
635		this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
636	}
637
638	void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
639	{
640		this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
641	}
642
643	void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
644	{
645		this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
646	}
647
648	static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
649						    enum mem_cgroup_events_index idx)
650	{
651		unsigned long val = 0;
652		int cpu;
653
654		for_each_online_cpu(cpu)
655			val += per_cpu(mem->stat->events[idx], cpu);
656	#ifdef CONFIG_HOTPLUG_CPU
657		spin_lock(&mem->pcp_counter_lock);
658		val += mem->nocpu_base.events[idx];
659		spin_unlock(&mem->pcp_counter_lock);
660	#endif
661		return val;
662	}
663
664	static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
665						 bool file, int nr_pages)
666	{
667		preempt_disable();
668
669		if (file)
670			__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
671		else
672			__this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
673
674		/* pagein of a big page is an event. So, ignore page size */
675		if (nr_pages > 0)
676			__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
677		else {
678			__this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
679			nr_pages = -nr_pages; /* for event */
680		}
681
682		__this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
683
684		preempt_enable();
685	}
686
687	unsigned long
688	mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
689				unsigned int lru_mask)
690	{
691		struct mem_cgroup_per_zone *mz;
692		enum lru_list l;
693		unsigned long ret = 0;
694
695		mz = mem_cgroup_zoneinfo(mem, nid, zid);
696
697		for_each_lru(l) {
698			if (BIT(l) & lru_mask)
699				ret += MEM_CGROUP_ZSTAT(mz, l);
700		}
701		return ret;
702	}
703
704	static unsigned long
705	mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem,
706				int nid, unsigned int lru_mask)
707	{
708		u64 total = 0;
709		int zid;
710
711		for (zid = 0; zid < MAX_NR_ZONES; zid++)
712			total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask);
713
714		return total;
715	}
716
717	static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem,
718				unsigned int lru_mask)
719	{
720		int nid;
721		u64 total = 0;
722
723		for_each_node_state(nid, N_HIGH_MEMORY)
724			total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask);
725		return total;
726	}
727
728	static bool __memcg_event_check(struct mem_cgroup *mem, int target)
729	{
730		unsigned long val, next;
731
732		val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
733		next = this_cpu_read(mem->stat->targets[target]);
734		/* from time_after() in jiffies.h */
735		return ((long)next - (long)val < 0);
736	}
737
738	static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
739	{
740		unsigned long val, next;
741
742		val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
743
744		switch (target) {
745		case MEM_CGROUP_TARGET_THRESH:
746			next = val + THRESHOLDS_EVENTS_TARGET;
747			break;
748		case MEM_CGROUP_TARGET_SOFTLIMIT:
749			next = val + SOFTLIMIT_EVENTS_TARGET;
750			break;
751		case MEM_CGROUP_TARGET_NUMAINFO:
752			next = val + NUMAINFO_EVENTS_TARGET;
753			break;
754		default:
755			return;
756		}
757
758		this_cpu_write(mem->stat->targets[target], next);
759	}
760
761	/*
762	 * Check events in order.
763	 *
764	 */
765	static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
766	{
767		/* threshold event is triggered in finer grain than soft limit */
768		if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
769			mem_cgroup_threshold(mem);
770			__mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
771			if (unlikely(__memcg_event_check(mem,
772				     MEM_CGROUP_TARGET_SOFTLIMIT))) {
773				mem_cgroup_update_tree(mem, page);
774				__mem_cgroup_target_update(mem,
775							   MEM_CGROUP_TARGET_SOFTLIMIT);
776			}
777	#if MAX_NUMNODES > 1
778			if (unlikely(__memcg_event_check(mem,
779				     MEM_CGROUP_TARGET_NUMAINFO))) {
780				atomic_inc(&mem->numainfo_events);
781				__mem_cgroup_target_update(mem,
782							   MEM_CGROUP_TARGET_NUMAINFO);
783			}
784	#endif
785		}
786	}
787
788	static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
789	{
790		return container_of(cgroup_subsys_state(cont,
791					mem_cgroup_subsys_id), struct mem_cgroup,
792					css);
793	}
794
795	struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
796	{
797		/*
798		 * mm_update_next_owner() may clear mm->owner to NULL
799		 * if it races with swapoff, page migration, etc.
800		 * So this can be called with p == NULL.
801		 */
802		if (unlikely(!p))
803			return NULL;
804
805		return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
806					struct mem_cgroup, css);
807	}
808
809	struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
810	{
811		struct mem_cgroup *mem = NULL;
812
813		if (!mm)
814			return NULL;
815		/*
816		 * Because we have no locks, mm->owner's may be being moved to other
817		 * cgroup. We use css_tryget() here even if this looks
818		 * pessimistic (rather than adding locks here).
819		 */
820		rcu_read_lock();
821		do {
822			mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
823			if (unlikely(!mem))
824				break;
825		} while (!css_tryget(&mem->css));
826		rcu_read_unlock();
827		return mem;
828	}
829
830	/* The caller has to guarantee "mem" exists before calling this */
831	static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
832	{
833		struct cgroup_subsys_state *css;
834		int found;
835
836		if (!mem) /* ROOT cgroup has the smallest ID */
837			return root_mem_cgroup; /*css_put/get against root is ignored*/
838		if (!mem->use_hierarchy) {
839			if (css_tryget(&mem->css))
840				return mem;
841			return NULL;
842		}
843		rcu_read_lock();
844		/*
845		 * searching a memory cgroup which has the smallest ID under given
846		 * ROOT cgroup. (ID >= 1)
847		 */
848		css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
849		if (css && css_tryget(css))
850			mem = container_of(css, struct mem_cgroup, css);
851		else
852			mem = NULL;
853		rcu_read_unlock();
854		return mem;
855	}
856
857	static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
858						struct mem_cgroup *root,
859						bool cond)
860	{
861		int nextid = css_id(&iter->css) + 1;
862		int found;
863		int hierarchy_used;
864		struct cgroup_subsys_state *css;
865
866		hierarchy_used = iter->use_hierarchy;
867
868		css_put(&iter->css);
869		/* If no ROOT, walk all, ignore hierarchy */
870		if (!cond || (root && !hierarchy_used))
871			return NULL;
872
873		if (!root)
874			root = root_mem_cgroup;
875
876		do {
877			iter = NULL;
878			rcu_read_lock();
879
880			css = css_get_next(&mem_cgroup_subsys, nextid,
881					&root->css, &found);
882			if (css && css_tryget(css))
883				iter = container_of(css, struct mem_cgroup, css);
884			rcu_read_unlock();
885			/* If css is NULL, no more cgroups will be found */
886			nextid = found + 1;
887		} while (css && !iter);
888
889		return iter;
890	}
891	/*
892	 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
893	 * be careful that "break" loop is not allowed. We have reference count.
894	 * Instead of that modify "cond" to be false and "continue" to exit the loop.
895	 */
896	#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
897		for (iter = mem_cgroup_start_loop(root);\
898		     iter != NULL;\
899		     iter = mem_cgroup_get_next(iter, root, cond))
900
901	#define for_each_mem_cgroup_tree(iter, root) \
902		for_each_mem_cgroup_tree_cond(iter, root, true)
903
904	#define for_each_mem_cgroup_all(iter) \
905		for_each_mem_cgroup_tree_cond(iter, NULL, true)
906
907
908	static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
909	{
910		return (mem == root_mem_cgroup);
911	}
912
913	void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
914	{
915		struct mem_cgroup *mem;
916
917		if (!mm)
918			return;
919
920		rcu_read_lock();
921		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
922		if (unlikely(!mem))
923			goto out;
924
925		switch (idx) {
926		case PGMAJFAULT:
927			mem_cgroup_pgmajfault(mem, 1);
928			break;
929		case PGFAULT:
930			mem_cgroup_pgfault(mem, 1);
931			break;
932		default:
933			BUG();
934		}
935	out:
936		rcu_read_unlock();
937	}
938	EXPORT_SYMBOL(mem_cgroup_count_vm_event);
939
940	/*
941	 * Following LRU functions are allowed to be used without PCG_LOCK.
942	 * Operations are called by routine of global LRU independently from memcg.
943	 * What we have to take care of here is validness of pc->mem_cgroup.
944	 *
945	 * Changes to pc->mem_cgroup happens when
946	 * 1. charge
947	 * 2. moving account
948	 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
949	 * It is added to LRU before charge.
950	 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
951	 * When moving account, the page is not on LRU. It's isolated.
952	 */
953
954	void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
955	{
956		struct page_cgroup *pc;
957		struct mem_cgroup_per_zone *mz;
958
959		if (mem_cgroup_disabled())
960			return;
961		pc = lookup_page_cgroup(page);
962		/* can happen while we handle swapcache. */
963		if (!TestClearPageCgroupAcctLRU(pc))
964			return;
965		VM_BUG_ON(!pc->mem_cgroup);
966		/*
967		 * We don't check PCG_USED bit. It's cleared when the "page" is finally
968		 * removed from global LRU.
969		 */
970		mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
971		/* huge page split is done under lru_lock. so, we have no races. */
972		MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
973		if (mem_cgroup_is_root(pc->mem_cgroup))
974			return;
975		VM_BUG_ON(list_empty(&pc->lru));
976		list_del_init(&pc->lru);
977	}
978
979	void mem_cgroup_del_lru(struct page *page)
980	{
981		mem_cgroup_del_lru_list(page, page_lru(page));
982	}
983
984	/*
985	 * Writeback is about to end against a page which has been marked for immediate
986	 * reclaim. If it still appears to be reclaimable, move it to the tail of the
987	 * inactive list.
988	 */
989	void mem_cgroup_rotate_reclaimable_page(struct page *page)
990	{
991		struct mem_cgroup_per_zone *mz;
992		struct page_cgroup *pc;
993		enum lru_list lru = page_lru(page);
994 994
995 if (mem_cgroup_disabled()) 995 if (mem_cgroup_disabled())
996 return; 996 return;
997 997
998 pc = lookup_page_cgroup(page); 998 pc = lookup_page_cgroup(page);
999 /* unused or root page is not rotated. */ 999 /* unused or root page is not rotated. */
1000 if (!PageCgroupUsed(pc)) 1000 if (!PageCgroupUsed(pc))
1001 return; 1001 return;
1002 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1002 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1003 smp_rmb(); 1003 smp_rmb();
1004 if (mem_cgroup_is_root(pc->mem_cgroup)) 1004 if (mem_cgroup_is_root(pc->mem_cgroup))
1005 return; 1005 return;
1006 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1006 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1007 list_move_tail(&pc->lru, &mz->lists[lru]); 1007 list_move_tail(&pc->lru, &mz->lists[lru]);
1008 } 1008 }
1009 1009
1010 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 1010 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
1011 { 1011 {
1012 struct mem_cgroup_per_zone *mz; 1012 struct mem_cgroup_per_zone *mz;
1013 struct page_cgroup *pc; 1013 struct page_cgroup *pc;
1014 1014
1015 if (mem_cgroup_disabled()) 1015 if (mem_cgroup_disabled())
1016 return; 1016 return;
1017 1017
1018 pc = lookup_page_cgroup(page); 1018 pc = lookup_page_cgroup(page);
1019 /* unused or root page is not rotated. */ 1019 /* unused or root page is not rotated. */
1020 if (!PageCgroupUsed(pc)) 1020 if (!PageCgroupUsed(pc))
1021 return; 1021 return;
1022 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1022 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1023 smp_rmb(); 1023 smp_rmb();
1024 if (mem_cgroup_is_root(pc->mem_cgroup)) 1024 if (mem_cgroup_is_root(pc->mem_cgroup))
1025 return; 1025 return;
1026 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1026 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1027 list_move(&pc->lru, &mz->lists[lru]); 1027 list_move(&pc->lru, &mz->lists[lru]);
1028 } 1028 }
1029 1029
1030 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 1030 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1031 { 1031 {
1032 struct page_cgroup *pc; 1032 struct page_cgroup *pc;
1033 struct mem_cgroup_per_zone *mz; 1033 struct mem_cgroup_per_zone *mz;
1034 1034
1035 if (mem_cgroup_disabled()) 1035 if (mem_cgroup_disabled())
1036 return; 1036 return;
1037 pc = lookup_page_cgroup(page); 1037 pc = lookup_page_cgroup(page);
1038 VM_BUG_ON(PageCgroupAcctLRU(pc)); 1038 VM_BUG_ON(PageCgroupAcctLRU(pc));
1039 if (!PageCgroupUsed(pc)) 1039 if (!PageCgroupUsed(pc))
1040 return; 1040 return;
1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1042 smp_rmb(); 1042 smp_rmb();
1043 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1043 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1044 /* huge page split is done under lru_lock. so, we have no races. */ 1044 /* huge page split is done under lru_lock. so, we have no races. */
1045 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 1045 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
1046 SetPageCgroupAcctLRU(pc); 1046 SetPageCgroupAcctLRU(pc);
1047 if (mem_cgroup_is_root(pc->mem_cgroup)) 1047 if (mem_cgroup_is_root(pc->mem_cgroup))
1048 return; 1048 return;
1049 list_add(&pc->lru, &mz->lists[lru]); 1049 list_add(&pc->lru, &mz->lists[lru]);
1050 } 1050 }
1051 1051
1052 /* 1052 /*
1053 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 1053 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
1054 * while it's linked to the LRU because the page may be reused after it's fully 1054 * while it's linked to the LRU because the page may be reused after it's fully
1055 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. 1055 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again.
1056 * It's done under lock_page and it is expected that zone->lru_lock is never held. 1056 * It's done under lock_page and it is expected that zone->lru_lock is never held.
1057 */ 1057 */
1058 static void mem_cgroup_lru_del_before_commit(struct page *page) 1058 static void mem_cgroup_lru_del_before_commit(struct page *page)
1059 { 1059 {
1060 unsigned long flags; 1060 unsigned long flags;
1061 struct zone *zone = page_zone(page); 1061 struct zone *zone = page_zone(page);
1062 struct page_cgroup *pc = lookup_page_cgroup(page); 1062 struct page_cgroup *pc = lookup_page_cgroup(page);
1063 1063
1064 /* 1064 /*
1065 * Doing this check without taking ->lru_lock seems wrong but this 1065 * Doing this check without taking ->lru_lock seems wrong but this
1066 * is safe. Because if page_cgroup's USED bit is unset, the page 1066 * is safe. Because if page_cgroup's USED bit is unset, the page
1067 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 1067 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1068 * set, the commit after this will fail, anyway. 1068 * set, the commit after this will fail, anyway.
1069 * All of this charge/uncharge is done under some mutual exclusion. 1069 * All of this charge/uncharge is done under some mutual exclusion.
1070 * So, we don't need to take care of changes in the USED bit. 1070 * So, we don't need to take care of changes in the USED bit.
1071 */ 1071 */
1072 if (likely(!PageLRU(page))) 1072 if (likely(!PageLRU(page)))
1073 return; 1073 return;
1074 1074
1075 spin_lock_irqsave(&zone->lru_lock, flags); 1075 spin_lock_irqsave(&zone->lru_lock, flags);
1076 /* 1076 /*
1077 * Forget old LRU when this page_cgroup is *not* used. This Used bit 1077 * Forget old LRU when this page_cgroup is *not* used. This Used bit
1078 * is guarded by lock_page() because the page is SwapCache. 1078 * is guarded by lock_page() because the page is SwapCache.
1079 */ 1079 */
1080 if (!PageCgroupUsed(pc)) 1080 if (!PageCgroupUsed(pc))
1081 mem_cgroup_del_lru_list(page, page_lru(page)); 1081 mem_cgroup_del_lru_list(page, page_lru(page));
1082 spin_unlock_irqrestore(&zone->lru_lock, flags); 1082 spin_unlock_irqrestore(&zone->lru_lock, flags);
1083 } 1083 }
1084 1084
1085 static void mem_cgroup_lru_add_after_commit(struct page *page) 1085 static void mem_cgroup_lru_add_after_commit(struct page *page)
1086 { 1086 {
1087 unsigned long flags; 1087 unsigned long flags;
1088 struct zone *zone = page_zone(page); 1088 struct zone *zone = page_zone(page);
1089 struct page_cgroup *pc = lookup_page_cgroup(page); 1089 struct page_cgroup *pc = lookup_page_cgroup(page);
1090 1090
1091 /* take care of the case where the page is added to the LRU while we commit it */ 1091 /* take care of the case where the page is added to the LRU while we commit it */
1092 if (likely(!PageLRU(page))) 1092 if (likely(!PageLRU(page)))
1093 return; 1093 return;
1094 spin_lock_irqsave(&zone->lru_lock, flags); 1094 spin_lock_irqsave(&zone->lru_lock, flags);
1095 /* link when the page is linked to LRU but page_cgroup isn't */ 1095 /* link when the page is linked to LRU but page_cgroup isn't */
1096 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1096 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
1097 mem_cgroup_add_lru_list(page, page_lru(page)); 1097 mem_cgroup_add_lru_list(page, page_lru(page));
1098 spin_unlock_irqrestore(&zone->lru_lock, flags); 1098 spin_unlock_irqrestore(&zone->lru_lock, flags);
1099 } 1099 }
1100 1100
1101 1101
1102 void mem_cgroup_move_lists(struct page *page, 1102 void mem_cgroup_move_lists(struct page *page,
1103 enum lru_list from, enum lru_list to) 1103 enum lru_list from, enum lru_list to)
1104 { 1104 {
1105 if (mem_cgroup_disabled()) 1105 if (mem_cgroup_disabled())
1106 return; 1106 return;
1107 mem_cgroup_del_lru_list(page, from); 1107 mem_cgroup_del_lru_list(page, from);
1108 mem_cgroup_add_lru_list(page, to); 1108 mem_cgroup_add_lru_list(page, to);
1109 } 1109 }
1110 1110
1111 /* 1111 /*
1112 * Checks whether given mem is same or in the root_mem's 1112 * Checks whether given mem is same or in the root_mem's
1113 * hierarchy subtree 1113 * hierarchy subtree
1114 */ 1114 */
1115 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1115 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem,
1116 struct mem_cgroup *mem) 1116 struct mem_cgroup *mem)
1117 { 1117 {
1118 if (root_mem != mem) { 1118 if (root_mem != mem) {
1119 return (root_mem->use_hierarchy && 1119 return (root_mem->use_hierarchy &&
1120 css_is_ancestor(&mem->css, &root_mem->css)); 1120 css_is_ancestor(&mem->css, &root_mem->css));
1121 } 1121 }
1122 1122
1123 return true; 1123 return true;
1124 } 1124 }
1125 1125
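A minimal stand-alone illustration of the check above (user-space C, not part of this file; struct cg, is_ancestor(), and the example hierarchy are invented stand-ins for mem_cgroup and css_is_ancestor()): a group matches itself unconditionally, and matches a descendant only when the root has use_hierarchy enabled.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct cg {
	const char *name;
	struct cg *parent;
	bool use_hierarchy;
};

/* model of css_is_ancestor(): is "root" an ancestor of "cg"? */
static bool is_ancestor(struct cg *cg, struct cg *root)
{
	for (cg = cg->parent; cg; cg = cg->parent)
		if (cg == root)
			return true;
	return false;
}

/* model of mem_cgroup_same_or_subtree() */
static bool same_or_subtree(struct cg *root, struct cg *cg)
{
	if (root != cg)
		return root->use_hierarchy && is_ancestor(cg, root);
	return true;
}

int main(void)
{
	struct cg a = { "a", NULL, true };
	struct cg b = { "b", &a, true };

	/* prints "1 0": b is under a's subtree, a is not under b's */
	printf("%d %d\n", same_or_subtree(&a, &b), same_or_subtree(&b, &a));
	return 0;
}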
1126 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1126 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1127 { 1127 {
1128 int ret; 1128 int ret;
1129 struct mem_cgroup *curr = NULL; 1129 struct mem_cgroup *curr = NULL;
1130 struct task_struct *p; 1130 struct task_struct *p;
1131 1131
1132 p = find_lock_task_mm(task); 1132 p = find_lock_task_mm(task);
1133 if (!p) 1133 if (!p)
1134 return 0; 1134 return 0;
1135 curr = try_get_mem_cgroup_from_mm(p->mm); 1135 curr = try_get_mem_cgroup_from_mm(p->mm);
1136 task_unlock(p); 1136 task_unlock(p);
1137 if (!curr) 1137 if (!curr)
1138 return 0; 1138 return 0;
1139 /* 1139 /*
1140 * We should check use_hierarchy of "mem" not "curr". Because checking 1140 * We should check use_hierarchy of "mem" not "curr". Because checking
1141 * use_hierarchy of "curr" here makes this function true if hierarchy is 1141 * use_hierarchy of "curr" here makes this function true if hierarchy is
1142 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* 1142 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
1143 * hierarchy (even if use_hierarchy is disabled in "mem"). 1143 * hierarchy (even if use_hierarchy is disabled in "mem").
1144 */ 1144 */
1145 ret = mem_cgroup_same_or_subtree(mem, curr); 1145 ret = mem_cgroup_same_or_subtree(mem, curr);
1146 css_put(&curr->css); 1146 css_put(&curr->css);
1147 return ret; 1147 return ret;
1148 } 1148 }
1149 1149
1150 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1150 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1151 { 1151 {
1152 unsigned long active; 1152 unsigned long active;
1153 unsigned long inactive; 1153 unsigned long inactive;
1154 unsigned long gb; 1154 unsigned long gb;
1155 unsigned long inactive_ratio; 1155 unsigned long inactive_ratio;
1156 1156
1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
1159 1159
1160 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1160 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1161 if (gb) 1161 if (gb)
1162 inactive_ratio = int_sqrt(10 * gb); 1162 inactive_ratio = int_sqrt(10 * gb);
1163 else 1163 else
1164 inactive_ratio = 1; 1164 inactive_ratio = 1;
1165 1165
1166 if (present_pages) { 1166 if (present_pages) {
1167 present_pages[0] = inactive; 1167 present_pages[0] = inactive;
1168 present_pages[1] = active; 1168 present_pages[1] = active;
1169 } 1169 }
1170 1170
1171 return inactive_ratio; 1171 return inactive_ratio;
1172 } 1172 }
1173 1173
1174 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1174 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1175 { 1175 {
1176 unsigned long active; 1176 unsigned long active;
1177 unsigned long inactive; 1177 unsigned long inactive;
1178 unsigned long present_pages[2]; 1178 unsigned long present_pages[2];
1179 unsigned long inactive_ratio; 1179 unsigned long inactive_ratio;
1180 1180
1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1182 1182
1183 inactive = present_pages[0]; 1183 inactive = present_pages[0];
1184 active = present_pages[1]; 1184 active = present_pages[1];
1185 1185
1186 if (inactive * inactive_ratio < active) 1186 if (inactive * inactive_ratio < active)
1187 return 1; 1187 return 1;
1188 1188
1189 return 0; 1189 return 0;
1190 } 1190 }
1191 1191
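The ratio computed above is int_sqrt(10 * size-in-GB), clamped to at least 1; the sketch below redoes that arithmetic in stand-alone C (not part of this file; the 4 KiB PAGE_SHIFT and the naive isqrt() are assumptions of the example, standing in for the kernel's definitions). With about 1 GiB of anon memory the ratio is 3, so the inactive list counts as "low" once it drops below roughly a quarter of the total.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages for the example */

/* crude integer square root, standing in for the kernel's int_sqrt() */
static unsigned long isqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - PAGE_SHIFT);

	return gb ? isqrt(10 * gb) : 1;
}

int main(void)
{
	/* 1 GiB of anon: ratio 3, so inactive is "low" once it is under ~1/4 */
	unsigned long pages = 1UL << (30 - PAGE_SHIFT);
	unsigned long inactive = pages / 5, active = pages - inactive;
	unsigned long ratio = inactive_ratio(inactive, active);

	printf("ratio=%lu low=%d\n", ratio, inactive * ratio < active);
	return 0;
}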
1192 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1192 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1193 { 1193 {
1194 unsigned long active; 1194 unsigned long active;
1195 unsigned long inactive; 1195 unsigned long inactive;
1196 1196
1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
1199 1199
1200 return (active > inactive); 1200 return (active > inactive);
1201 } 1201 }
1202 1202
1203 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1203 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1204 struct zone *zone) 1204 struct zone *zone)
1205 { 1205 {
1206 int nid = zone_to_nid(zone); 1206 int nid = zone_to_nid(zone);
1207 int zid = zone_idx(zone); 1207 int zid = zone_idx(zone);
1208 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1208 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1209 1209
1210 return &mz->reclaim_stat; 1210 return &mz->reclaim_stat;
1211 } 1211 }
1212 1212
1213 struct zone_reclaim_stat * 1213 struct zone_reclaim_stat *
1214 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1214 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1215 { 1215 {
1216 struct page_cgroup *pc; 1216 struct page_cgroup *pc;
1217 struct mem_cgroup_per_zone *mz; 1217 struct mem_cgroup_per_zone *mz;
1218 1218
1219 if (mem_cgroup_disabled()) 1219 if (mem_cgroup_disabled())
1220 return NULL; 1220 return NULL;
1221 1221
1222 pc = lookup_page_cgroup(page); 1222 pc = lookup_page_cgroup(page);
1223 if (!PageCgroupUsed(pc)) 1223 if (!PageCgroupUsed(pc))
1224 return NULL; 1224 return NULL;
1225 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1225 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1226 smp_rmb(); 1226 smp_rmb();
1227 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1227 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1228 return &mz->reclaim_stat; 1228 return &mz->reclaim_stat;
1229 } 1229 }
1230 1230
1231 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1231 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1232 struct list_head *dst, 1232 struct list_head *dst,
1233 unsigned long *scanned, int order, 1233 unsigned long *scanned, int order,
1234 int mode, struct zone *z, 1234 int mode, struct zone *z,
1235 struct mem_cgroup *mem_cont, 1235 struct mem_cgroup *mem_cont,
1236 int active, int file) 1236 int active, int file)
1237 { 1237 {
1238 unsigned long nr_taken = 0; 1238 unsigned long nr_taken = 0;
1239 struct page *page; 1239 struct page *page;
1240 unsigned long scan; 1240 unsigned long scan;
1241 LIST_HEAD(pc_list); 1241 LIST_HEAD(pc_list);
1242 struct list_head *src; 1242 struct list_head *src;
1243 struct page_cgroup *pc, *tmp; 1243 struct page_cgroup *pc, *tmp;
1244 int nid = zone_to_nid(z); 1244 int nid = zone_to_nid(z);
1245 int zid = zone_idx(z); 1245 int zid = zone_idx(z);
1246 struct mem_cgroup_per_zone *mz; 1246 struct mem_cgroup_per_zone *mz;
1247 int lru = LRU_FILE * file + active; 1247 int lru = LRU_FILE * file + active;
1248 int ret; 1248 int ret;
1249 1249
1250 BUG_ON(!mem_cont); 1250 BUG_ON(!mem_cont);
1251 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1251 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1252 src = &mz->lists[lru]; 1252 src = &mz->lists[lru];
1253 1253
1254 scan = 0; 1254 scan = 0;
1255 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1255 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1256 if (scan >= nr_to_scan) 1256 if (scan >= nr_to_scan)
1257 break; 1257 break;
1258 1258
1259 if (unlikely(!PageCgroupUsed(pc))) 1259 if (unlikely(!PageCgroupUsed(pc)))
1260 continue; 1260 continue;
1261 1261
1262 page = lookup_cgroup_page(pc); 1262 page = lookup_cgroup_page(pc);
1263 1263
1264 if (unlikely(!PageLRU(page))) 1264 if (unlikely(!PageLRU(page)))
1265 continue; 1265 continue;
1266 1266
1267 scan++; 1267 scan++;
1268 ret = __isolate_lru_page(page, mode, file); 1268 ret = __isolate_lru_page(page, mode, file);
1269 switch (ret) { 1269 switch (ret) {
1270 case 0: 1270 case 0:
1271 list_move(&page->lru, dst); 1271 list_move(&page->lru, dst);
1272 mem_cgroup_del_lru(page); 1272 mem_cgroup_del_lru(page);
1273 nr_taken += hpage_nr_pages(page); 1273 nr_taken += hpage_nr_pages(page);
1274 break; 1274 break;
1275 case -EBUSY: 1275 case -EBUSY:
1276 /* we don't affect global LRU but rotate in our LRU */ 1276 /* we don't affect global LRU but rotate in our LRU */
1277 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1277 mem_cgroup_rotate_lru_list(page, page_lru(page));
1278 break; 1278 break;
1279 default: 1279 default:
1280 break; 1280 break;
1281 } 1281 }
1282 } 1282 }
1283 1283
1284 *scanned = scan; 1284 *scanned = scan;
1285 1285
1286 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1286 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1287 0, 0, 0, mode); 1287 0, 0, 0, mode);
1288 1288
1289 return nr_taken; 1289 return nr_taken;
1290 } 1290 }
1291 1291
1292 #define mem_cgroup_from_res_counter(counter, member) \ 1292 #define mem_cgroup_from_res_counter(counter, member) \
1293 container_of(counter, struct mem_cgroup, member) 1293 container_of(counter, struct mem_cgroup, member)
1294 1294
1295 /** 1295 /**
1296 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1296 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1297 * @mem: the memory cgroup 1297 * @mem: the memory cgroup
1298 * 1298 *
1299 * Returns the maximum amount of memory @mem can be charged with, in 1299 * Returns the maximum amount of memory @mem can be charged with, in
1300 * pages. 1300 * pages.
1301 */ 1301 */
1302 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1302 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1303 { 1303 {
1304 unsigned long long margin; 1304 unsigned long long margin;
1305 1305
1306 margin = res_counter_margin(&mem->res); 1306 margin = res_counter_margin(&mem->res);
1307 if (do_swap_account) 1307 if (do_swap_account)
1308 margin = min(margin, res_counter_margin(&mem->memsw)); 1308 margin = min(margin, res_counter_margin(&mem->memsw));
1309 return margin >> PAGE_SHIFT; 1309 return margin >> PAGE_SHIFT;
1310 } 1310 }
1311 1311
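A stand-alone rendition of the headroom calculation above (user-space C, not part of this file; struct counter, the 4 KiB page size, and the example numbers are invented for illustration): with swap accounting on, the chargeable space is the smaller of the two counters' remaining room, reported in pages.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages */

struct counter { unsigned long long usage, limit; };

/* model of res_counter_margin(): free room left under the limit, in bytes */
static unsigned long long counter_margin(const struct counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

/* model of mem_cgroup_margin() with swap accounting enabled */
static unsigned long margin_pages(const struct counter *res, const struct counter *memsw)
{
	unsigned long long margin = counter_margin(res);
	unsigned long long m2 = counter_margin(memsw);

	if (m2 < margin)
		margin = m2;
	return (unsigned long)(margin >> PAGE_SHIFT);
}

int main(void)
{
	struct counter res   = { 300ULL << 20, 512ULL << 20 };	/* 300 MB used of 512 MB */
	struct counter memsw = { 460ULL << 20, 512ULL << 20 };	/* swap already eats most of the rest */

	printf("%lu pages\n", margin_pages(&res, &memsw));	/* limited by memsw: 52 MB = 13312 pages */
	return 0;
}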
1312 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1312 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1313 { 1313 {
1314 struct cgroup *cgrp = memcg->css.cgroup; 1314 struct cgroup *cgrp = memcg->css.cgroup;
1315 1315
1316 /* root ? */ 1316 /* root ? */
1317 if (cgrp->parent == NULL) 1317 if (cgrp->parent == NULL)
1318 return vm_swappiness; 1318 return vm_swappiness;
1319 1319
1320 return memcg->swappiness; 1320 return memcg->swappiness;
1321 } 1321 }
1322 1322
1323 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1323 static void mem_cgroup_start_move(struct mem_cgroup *mem)
1324 { 1324 {
1325 int cpu; 1325 int cpu;
1326 1326
1327 get_online_cpus(); 1327 get_online_cpus();
1328 spin_lock(&mem->pcp_counter_lock); 1328 spin_lock(&mem->pcp_counter_lock);
1329 for_each_online_cpu(cpu) 1329 for_each_online_cpu(cpu)
1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1332 spin_unlock(&mem->pcp_counter_lock); 1332 spin_unlock(&mem->pcp_counter_lock);
1333 put_online_cpus(); 1333 put_online_cpus();
1334 1334
1335 synchronize_rcu(); 1335 synchronize_rcu();
1336 } 1336 }
1337 1337
1338 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1338 static void mem_cgroup_end_move(struct mem_cgroup *mem)
1339 { 1339 {
1340 int cpu; 1340 int cpu;
1341 1341
1342 if (!mem) 1342 if (!mem)
1343 return; 1343 return;
1344 get_online_cpus(); 1344 get_online_cpus();
1345 spin_lock(&mem->pcp_counter_lock); 1345 spin_lock(&mem->pcp_counter_lock);
1346 for_each_online_cpu(cpu) 1346 for_each_online_cpu(cpu)
1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1349 spin_unlock(&mem->pcp_counter_lock); 1349 spin_unlock(&mem->pcp_counter_lock);
1350 put_online_cpus(); 1350 put_online_cpus();
1351 } 1351 }
1352 /* 1352 /*
1353 * Two routines for checking whether "mem" is under move_account() or not. 1353 * Two routines for checking whether "mem" is under move_account() or not.
1354 * 1354 *
1355 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This is used 1355 * mem_cgroup_stealed() - checking whether a cgroup is mc.from or not. This is used
1356 * for avoiding race in accounting. If true, 1356 * for avoiding race in accounting. If true,
1357 * pc->mem_cgroup may be overwritten. 1357 * pc->mem_cgroup may be overwritten.
1358 * 1358 *
1359 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or 1359 * mem_cgroup_under_move() - checking whether a cgroup is mc.from or mc.to or
1360 * under the hierarchy of moving cgroups. This is for 1360 * under the hierarchy of moving cgroups. This is for
1361 * waiting at high memory pressure caused by "move". 1361 * waiting at high memory pressure caused by "move".
1362 */ 1362 */
1363 1363
1364 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1364 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1365 { 1365 {
1366 VM_BUG_ON(!rcu_read_lock_held()); 1366 VM_BUG_ON(!rcu_read_lock_held());
1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1368 } 1368 }
1369 1369
1370 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1370 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1371 { 1371 {
1372 struct mem_cgroup *from; 1372 struct mem_cgroup *from;
1373 struct mem_cgroup *to; 1373 struct mem_cgroup *to;
1374 bool ret = false; 1374 bool ret = false;
1375 /* 1375 /*
1376 * Unlike task_move routines, we access mc.to, mc.from not under 1376 * Unlike task_move routines, we access mc.to, mc.from not under
1377 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1377 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1378 */ 1378 */
1379 spin_lock(&mc.lock); 1379 spin_lock(&mc.lock);
1380 from = mc.from; 1380 from = mc.from;
1381 to = mc.to; 1381 to = mc.to;
1382 if (!from) 1382 if (!from)
1383 goto unlock; 1383 goto unlock;
1384 1384
1385 ret = mem_cgroup_same_or_subtree(mem, from) 1385 ret = mem_cgroup_same_or_subtree(mem, from)
1386 || mem_cgroup_same_or_subtree(mem, to); 1386 || mem_cgroup_same_or_subtree(mem, to);
1387 unlock: 1387 unlock:
1388 spin_unlock(&mc.lock); 1388 spin_unlock(&mc.lock);
1389 return ret; 1389 return ret;
1390 } 1390 }
1391 1391
1392 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1392 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1393 { 1393 {
1394 if (mc.moving_task && current != mc.moving_task) { 1394 if (mc.moving_task && current != mc.moving_task) {
1395 if (mem_cgroup_under_move(mem)) { 1395 if (mem_cgroup_under_move(mem)) {
1396 DEFINE_WAIT(wait); 1396 DEFINE_WAIT(wait);
1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1398 /* moving charge context might have finished. */ 1398 /* moving charge context might have finished. */
1399 if (mc.moving_task) 1399 if (mc.moving_task)
1400 schedule(); 1400 schedule();
1401 finish_wait(&mc.waitq, &wait); 1401 finish_wait(&mc.waitq, &wait);
1402 return true; 1402 return true;
1403 } 1403 }
1404 } 1404 }
1405 return false; 1405 return false;
1406 } 1406 }
1407 1407
1408 /** 1408 /**
1409 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1409 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1410 * @memcg: The memory cgroup that went over limit 1410 * @memcg: The memory cgroup that went over limit
1411 * @p: Task that is going to be killed 1411 * @p: Task that is going to be killed
1412 * 1412 *
1413 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1413 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1414 * enabled 1414 * enabled
1415 */ 1415 */
1416 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1416 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1417 { 1417 {
1418 struct cgroup *task_cgrp; 1418 struct cgroup *task_cgrp;
1419 struct cgroup *mem_cgrp; 1419 struct cgroup *mem_cgrp;
1420 /* 1420 /*
1421 * Need a buffer in BSS, can't rely on allocations. The code relies 1421 * Need a buffer in BSS, can't rely on allocations. The code relies
1422 * on the assumption that OOM is serialized for memory controller. 1422 * on the assumption that OOM is serialized for memory controller.
1423 * If this assumption is broken, revisit this code. 1423 * If this assumption is broken, revisit this code.
1424 */ 1424 */
1425 static char memcg_name[PATH_MAX]; 1425 static char memcg_name[PATH_MAX];
1426 int ret; 1426 int ret;
1427 1427
1428 if (!memcg || !p) 1428 if (!memcg || !p)
1429 return; 1429 return;
1430 1430
1431 1431
1432 rcu_read_lock(); 1432 rcu_read_lock();
1433 1433
1434 mem_cgrp = memcg->css.cgroup; 1434 mem_cgrp = memcg->css.cgroup;
1435 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1435 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1436 1436
1437 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1437 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1438 if (ret < 0) { 1438 if (ret < 0) {
1439 /* 1439 /*
1440 * Unfortunately, we are unable to convert to a useful name, 1440 * Unfortunately, we are unable to convert to a useful name,
1441 * but we'll still print out the usage information. 1441 * but we'll still print out the usage information.
1442 */ 1442 */
1443 rcu_read_unlock(); 1443 rcu_read_unlock();
1444 goto done; 1444 goto done;
1445 } 1445 }
1446 rcu_read_unlock(); 1446 rcu_read_unlock();
1447 1447
1448 printk(KERN_INFO "Task in %s killed", memcg_name); 1448 printk(KERN_INFO "Task in %s killed", memcg_name);
1449 1449
1450 rcu_read_lock(); 1450 rcu_read_lock();
1451 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1451 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1452 if (ret < 0) { 1452 if (ret < 0) {
1453 rcu_read_unlock(); 1453 rcu_read_unlock();
1454 goto done; 1454 goto done;
1455 } 1455 }
1456 rcu_read_unlock(); 1456 rcu_read_unlock();
1457 1457
1458 /* 1458 /*
1459 * Continues from above, so we don't need a KERN_ level 1459 * Continues from above, so we don't need a KERN_ level
1460 */ 1460 */
1461 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1461 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1462 done: 1462 done:
1463 1463
1464 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1464 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1465 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1465 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1466 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1466 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1467 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1467 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1468 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1468 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1469 "failcnt %llu\n", 1469 "failcnt %llu\n",
1470 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1470 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1471 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1471 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1472 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1472 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1473 } 1473 }
1474 1474
1475 /* 1475 /*
1476 * This function returns the number of memcgs under the hierarchy tree. Returns 1476 * This function returns the number of memcgs under the hierarchy tree. Returns
1477 * 1 (self count) if there are no children. 1477 * 1 (self count) if there are no children.
1478 */ 1478 */
1479 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1479 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1480 { 1480 {
1481 int num = 0; 1481 int num = 0;
1482 struct mem_cgroup *iter; 1482 struct mem_cgroup *iter;
1483 1483
1484 for_each_mem_cgroup_tree(iter, mem) 1484 for_each_mem_cgroup_tree(iter, mem)
1485 num++; 1485 num++;
1486 return num; 1486 return num;
1487 } 1487 }
1488 1488
1489 /* 1489 /*
1490 * Return the memory (and swap, if configured) limit for a memcg. 1490 * Return the memory (and swap, if configured) limit for a memcg.
1491 */ 1491 */
1492 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1492 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1493 { 1493 {
1494 u64 limit; 1494 u64 limit;
1495 u64 memsw; 1495 u64 memsw;
1496 1496
1497 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1497 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1498 limit += total_swap_pages << PAGE_SHIFT; 1498 limit += total_swap_pages << PAGE_SHIFT;
1499 1499
1500 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1500 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1501 /* 1501 /*
1502 * If memsw is finite and limits the amount of swap space available 1502 * If memsw is finite and limits the amount of swap space available
1503 * to this memcg, return that limit. 1503 * to this memcg, return that limit.
1504 */ 1504 */
1505 return min(limit, memsw); 1505 return min(limit, memsw);
1506 } 1506 }
1507 1507
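The limit returned above is effectively min(memory limit + total swap, memory+swap limit); a stand-alone sketch with made-up numbers (not part of this file; the cgroupfs file names in the comments are only reminders of where these values come from in memcg v1):

#include <stdio.h>

int main(void)
{
	/* illustrative numbers only */
	unsigned long long mem_limit   = 512ULL << 20;	/* memory.limit_in_bytes */
	unsigned long long memsw_limit = 768ULL << 20;	/* memory.memsw.limit_in_bytes */
	unsigned long long total_swap  = 1024ULL << 20;	/* swap configured system-wide */

	/* model of mem_cgroup_get_limit(): memory limit plus all swap,
	 * clamped by the (possibly tighter) memory+swap limit */
	unsigned long long limit = mem_limit + total_swap;
	if (memsw_limit < limit)
		limit = memsw_limit;

	printf("effective limit: %llu MB\n", limit >> 20);	/* 768 MB here */
	return 0;
}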
1508 /* 1508 /*
1509 * Visit the first child (need not be the first child as per the ordering 1509 * Visit the first child (need not be the first child as per the ordering
1510 * of the cgroup list, since we track last_scanned_child) of @mem and use 1510 * of the cgroup list, since we track last_scanned_child) of @mem and use
1511 * that to reclaim free pages from. 1511 * that to reclaim free pages from.
1512 */ 1512 */
1513 static struct mem_cgroup * 1513 static struct mem_cgroup *
1514 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1514 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1515 { 1515 {
1516 struct mem_cgroup *ret = NULL; 1516 struct mem_cgroup *ret = NULL;
1517 struct cgroup_subsys_state *css; 1517 struct cgroup_subsys_state *css;
1518 int nextid, found; 1518 int nextid, found;
1519 1519
1520 if (!root_mem->use_hierarchy) { 1520 if (!root_mem->use_hierarchy) {
1521 css_get(&root_mem->css); 1521 css_get(&root_mem->css);
1522 ret = root_mem; 1522 ret = root_mem;
1523 } 1523 }
1524 1524
1525 while (!ret) { 1525 while (!ret) {
1526 rcu_read_lock(); 1526 rcu_read_lock();
1527 nextid = root_mem->last_scanned_child + 1; 1527 nextid = root_mem->last_scanned_child + 1;
1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1529 &found); 1529 &found);
1530 if (css && css_tryget(css)) 1530 if (css && css_tryget(css))
1531 ret = container_of(css, struct mem_cgroup, css); 1531 ret = container_of(css, struct mem_cgroup, css);
1532 1532
1533 rcu_read_unlock(); 1533 rcu_read_unlock();
1534 /* Updates scanning parameter */ 1534 /* Updates scanning parameter */
1535 if (!css) { 1535 if (!css) {
1536 /* this means start scan from ID:1 */ 1536 /* this means start scan from ID:1 */
1537 root_mem->last_scanned_child = 0; 1537 root_mem->last_scanned_child = 0;
1538 } else 1538 } else
1539 root_mem->last_scanned_child = found; 1539 root_mem->last_scanned_child = found;
1540 } 1540 }
1541 1541
1542 return ret; 1542 return ret;
1543 } 1543 }
1544 1544
1545 /** 1545 /**
1546 * test_mem_cgroup_node_reclaimable 1546 * test_mem_cgroup_node_reclaimable
1547 * @mem: the target memcg 1547 * @mem: the target memcg
1548 * @nid: the node ID to be checked. 1548 * @nid: the node ID to be checked.
1549 * @noswap : specify true here if the user wants file only information. 1549 * @noswap : specify true here if the user wants file only information.
1550 * 1550 *
1551 * This function returns whether the specified memcg contains any 1551 * This function returns whether the specified memcg contains any
1552 * reclaimable pages on a node. Returns true if there are any reclaimable 1552 * reclaimable pages on a node. Returns true if there are any reclaimable
1553 * pages in the node. 1553 * pages in the node.
1554 */ 1554 */
1555 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1555 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1556 int nid, bool noswap) 1556 int nid, bool noswap)
1557 { 1557 {
1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE))
1559 return true; 1559 return true;
1560 if (noswap || !total_swap_pages) 1560 if (noswap || !total_swap_pages)
1561 return false; 1561 return false;
1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON))
1563 return true; 1563 return true;
1564 return false; 1564 return false;
1565 1565
1566 } 1566 }
1567 #if MAX_NUMNODES > 1 1567 #if MAX_NUMNODES > 1
1568 1568
1569 /* 1569 /*
1570 * Always updating the nodemask is not very good - even if we have an empty 1570 * Always updating the nodemask is not very good - even if we have an empty
1571 * list or the wrong list here, we can start from some node and traverse all 1571 * list or the wrong list here, we can start from some node and traverse all
1572 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1572 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1573 * 1573 *
1574 */ 1574 */
1575 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1575 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1576 { 1576 {
1577 int nid; 1577 int nid;
1578 /* 1578 /*
1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1580 * pagein/pageout changes since the last update. 1580 * pagein/pageout changes since the last update.
1581 */ 1581 */
1582 if (!atomic_read(&mem->numainfo_events)) 1582 if (!atomic_read(&mem->numainfo_events))
1583 return; 1583 return;
1584 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1584 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1585 return; 1585 return;
1586 1586
1587 /* make a nodemask where this memcg uses memory from */ 1587 /* make a nodemask where this memcg uses memory from */
1588 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1588 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1589 1589
1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1591 1591
1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1593 node_clear(nid, mem->scan_nodes); 1593 node_clear(nid, mem->scan_nodes);
1594 } 1594 }
1595 1595
1596 atomic_set(&mem->numainfo_events, 0); 1596 atomic_set(&mem->numainfo_events, 0);
1597 atomic_set(&mem->numainfo_updating, 0); 1597 atomic_set(&mem->numainfo_updating, 0);
1598 } 1598 }
1599 1599
1600 /* 1600 /*
1601 * Selecting a node to start reclaim from. Because what we need is just 1601 * Selecting a node to start reclaim from. Because what we need is just
1602 * reducing the usage counter, starting from anywhere is OK. Considering 1602 * reducing the usage counter, starting from anywhere is OK. Considering
1603 * memory reclaim from current node, there are pros. and cons. 1603 * memory reclaim from current node, there are pros. and cons.
1604 * 1604 *
1605 * Freeing memory from current node means freeing memory from a node which 1605 * Freeing memory from current node means freeing memory from a node which
1606 * we'll use or we've used. So, it may make LRU bad. And if several threads 1606 * we'll use or we've used. So, it may make LRU bad. And if several threads
1607 * hit limits, it will see a contention on a node. But freeing from remote 1607 * hit limits, it will see a contention on a node. But freeing from remote
1608 * node means more costs for memory reclaim because of memory latency. 1608 * node means more costs for memory reclaim because of memory latency.
1609 * 1609 *
1610 * Now, we use round-robin. Better algorithm is welcomed. 1610 * Now, we use round-robin. Better algorithm is welcomed.
1611 */ 1611 */
1612 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1612 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1613 { 1613 {
1614 int node; 1614 int node;
1615 1615
1616 mem_cgroup_may_update_nodemask(mem); 1616 mem_cgroup_may_update_nodemask(mem);
1617 node = mem->last_scanned_node; 1617 node = mem->last_scanned_node;
1618 1618
1619 node = next_node(node, mem->scan_nodes); 1619 node = next_node(node, mem->scan_nodes);
1620 if (node == MAX_NUMNODES) 1620 if (node == MAX_NUMNODES)
1621 node = first_node(mem->scan_nodes); 1621 node = first_node(mem->scan_nodes);
1622 /* 1622 /*
1623 * We call this when we hit limit, not when pages are added to LRU. 1623 * We call this when we hit limit, not when pages are added to LRU.
1624 * No LRU may hold pages because all pages are UNEVICTABLE or 1624 * No LRU may hold pages because all pages are UNEVICTABLE or
1625 * memcg is too small and all pages are not on LRU. In that case, 1625 * memcg is too small and all pages are not on LRU. In that case,
1626 * we use the current node. 1626 * we use the current node.
1627 */ 1627 */
1628 if (unlikely(node == MAX_NUMNODES)) 1628 if (unlikely(node == MAX_NUMNODES))
1629 node = numa_node_id(); 1629 node = numa_node_id();
1630 1630
1631 mem->last_scanned_node = node; 1631 mem->last_scanned_node = node;
1632 return node; 1632 return node;
1633 } 1633 }
1634 1634
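A stand-alone model of the round-robin selection above (user-space C, not part of this file; MAX_NODES, the bitmask representation, and the helper names are invented stand-ins for the kernel's nodemask API): walk to the next set bit after the last scanned node, wrap to the first set bit, and fall back to the local node when the mask is empty.

#include <stdio.h>

#define MAX_NODES 8

/* model of next_node(): first set bit strictly after "node", or MAX_NODES */
static int model_next_node(int node, unsigned int mask)
{
	for (int n = node + 1; n < MAX_NODES; n++)
		if (mask & (1u << n))
			return n;
	return MAX_NODES;
}

/* model of first_node() */
static int model_first_node(unsigned int mask)
{
	return model_next_node(-1, mask);
}

/* round-robin selection in the spirit of mem_cgroup_select_victim_node() */
static int select_victim_node(int *last_scanned, unsigned int scan_nodes, int this_node)
{
	int node = model_next_node(*last_scanned, scan_nodes);

	if (node == MAX_NODES)
		node = model_first_node(scan_nodes);
	if (node == MAX_NODES)		/* empty mask: fall back to the local node */
		node = this_node;
	*last_scanned = node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = (1u << 1) | (1u << 3);	/* reclaimable nodes 1 and 3 */
	int last = MAX_NODES;					/* nothing scanned yet */

	for (int i = 0; i < 4; i++)
		printf("%d ", select_victim_node(&last, scan_nodes, 0));
	printf("\n");						/* prints: 1 3 1 3 */
	return 0;
}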
1635 /* 1635 /*
1636 * Check all nodes for whether they contain reclaimable pages or not. 1636 * Check all nodes for whether they contain reclaimable pages or not.
1637 * For quick scan, we make use of scan_nodes. This will allow us to skip 1637 * For quick scan, we make use of scan_nodes. This will allow us to skip
1638 * unused nodes. But scan_nodes is lazily updated and may not contain 1638 * unused nodes. But scan_nodes is lazily updated and may not contain
1639 * enough new information. We need to do double check. 1639 * enough new information. We need to do double check.
1640 */ 1640 */
1641 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1642 { 1642 {
1643 int nid; 1643 int nid;
1644 1644
1645 /* 1645 /*
1646 * quick check...making use of scan_node. 1646 * quick check...making use of scan_node.
1647 * We can skip unused nodes. 1647 * We can skip unused nodes.
1648 */ 1648 */
1649 if (!nodes_empty(mem->scan_nodes)) { 1649 if (!nodes_empty(mem->scan_nodes)) {
1650 for (nid = first_node(mem->scan_nodes); 1650 for (nid = first_node(mem->scan_nodes);
1651 nid < MAX_NUMNODES; 1651 nid < MAX_NUMNODES;
1652 nid = next_node(nid, mem->scan_nodes)) { 1652 nid = next_node(nid, mem->scan_nodes)) {
1653 1653
1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1655 return true; 1655 return true;
1656 } 1656 }
1657 } 1657 }
1658 /* 1658 /*
1659 * Check rest of nodes. 1659 * Check rest of nodes.
1660 */ 1660 */
1661 for_each_node_state(nid, N_HIGH_MEMORY) { 1661 for_each_node_state(nid, N_HIGH_MEMORY) {
1662 if (node_isset(nid, mem->scan_nodes)) 1662 if (node_isset(nid, mem->scan_nodes))
1663 continue; 1663 continue;
1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1665 return true; 1665 return true;
1666 } 1666 }
1667 return false; 1667 return false;
1668 } 1668 }
1669 1669
1670 #else 1670 #else
1671 int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1671 int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1672 { 1672 {
1673 return 0; 1673 return 0;
1674 } 1674 }
1675 1675
1676 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1676 bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1677 { 1677 {
1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1679 } 1679 }
1680 #endif 1680 #endif
1681 1681
1682 static void __mem_cgroup_record_scanstat(unsigned long *stats, 1682 static void __mem_cgroup_record_scanstat(unsigned long *stats,
1683 struct memcg_scanrecord *rec) 1683 struct memcg_scanrecord *rec)
1684 { 1684 {
1685 1685
1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; 1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1687 stats[SCAN_ANON] += rec->nr_scanned[0]; 1687 stats[SCAN_ANON] += rec->nr_scanned[0];
1688 stats[SCAN_FILE] += rec->nr_scanned[1]; 1688 stats[SCAN_FILE] += rec->nr_scanned[1];
1689 1689
1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; 1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1691 stats[ROTATE_ANON] += rec->nr_rotated[0]; 1691 stats[ROTATE_ANON] += rec->nr_rotated[0];
1692 stats[ROTATE_FILE] += rec->nr_rotated[1]; 1692 stats[ROTATE_FILE] += rec->nr_rotated[1];
1693 1693
1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; 1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1695 stats[FREED_ANON] += rec->nr_freed[0]; 1695 stats[FREED_ANON] += rec->nr_freed[0];
1696 stats[FREED_FILE] += rec->nr_freed[1]; 1696 stats[FREED_FILE] += rec->nr_freed[1];
1697 1697
1698 stats[ELAPSED] += rec->elapsed; 1698 stats[ELAPSED] += rec->elapsed;
1699 } 1699 }
1700 1700
1701 static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) 1701 static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1702 { 1702 {
1703 struct mem_cgroup *mem; 1703 struct mem_cgroup *mem;
1704 int context = rec->context; 1704 int context = rec->context;
1705 1705
1706 if (context >= NR_SCAN_CONTEXT) 1706 if (context >= NR_SCAN_CONTEXT)
1707 return; 1707 return;
1708 1708
1709 mem = rec->mem; 1709 mem = rec->mem;
1710 spin_lock(&mem->scanstat.lock); 1710 spin_lock(&mem->scanstat.lock);
1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); 1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1712 spin_unlock(&mem->scanstat.lock); 1712 spin_unlock(&mem->scanstat.lock);
1713 1713
1714 mem = rec->root; 1714 mem = rec->root;
1715 spin_lock(&mem->scanstat.lock); 1715 spin_lock(&mem->scanstat.lock);
1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); 1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1717 spin_unlock(&mem->scanstat.lock); 1717 spin_unlock(&mem->scanstat.lock);
1718 } 1718 }
1719 1719
1720 /* 1720 /*
1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1722 * we reclaimed from, so that we don't end up penalizing one child extensively 1722 * we reclaimed from, so that we don't end up penalizing one child extensively
1723 * based on its position in the children list. 1723 * based on its position in the children list.
1724 * 1724 *
1725 * root_mem is the original ancestor that we've been reclaiming from. 1725 * root_mem is the original ancestor that we've been reclaiming from.
1726 * 1726 *
1727 * We give up and return to the caller when we visit root_mem twice. 1727 * We give up and return to the caller when we visit root_mem twice.
1728 * (other groups can be removed while we're walking....) 1728 * (other groups can be removed while we're walking....)
1729 * 1729 *
1730 * If shrink==true, to avoid freeing too much, this returns immediately. 1730 * If shrink==true, to avoid freeing too much, this returns immediately.
1731 */ 1731 */
1732 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1732 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1733 struct zone *zone, 1733 struct zone *zone,
1734 gfp_t gfp_mask, 1734 gfp_t gfp_mask,
1735 unsigned long reclaim_options, 1735 unsigned long reclaim_options,
1736 unsigned long *total_scanned) 1736 unsigned long *total_scanned)
1737 { 1737 {
1738 struct mem_cgroup *victim; 1738 struct mem_cgroup *victim;
1739 int ret, total = 0; 1739 int ret, total = 0;
1740 int loop = 0; 1740 int loop = 0;
1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1744 struct memcg_scanrecord rec; 1744 struct memcg_scanrecord rec;
1745 unsigned long excess; 1745 unsigned long excess;
1746 unsigned long scanned; 1746 unsigned long scanned;
1747 1747
1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1749 1749
1750 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1750 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum)
1752 noswap = true; 1752 noswap = true;
1753 1753
1754 if (shrink) 1754 if (shrink)
1755 rec.context = SCAN_BY_SHRINK; 1755 rec.context = SCAN_BY_SHRINK;
1756 else if (check_soft) 1756 else if (check_soft)
1757 rec.context = SCAN_BY_SYSTEM; 1757 rec.context = SCAN_BY_SYSTEM;
1758 else 1758 else
1759 rec.context = SCAN_BY_LIMIT; 1759 rec.context = SCAN_BY_LIMIT;
1760 1760
1761 rec.root = root_mem; 1761 rec.root = root_mem;
1762 1762
1763 while (1) { 1763 while (1) {
1764 victim = mem_cgroup_select_victim(root_mem); 1764 victim = mem_cgroup_select_victim(root_mem);
1765 if (victim == root_mem) { 1765 if (victim == root_mem) {
1766 loop++; 1766 loop++;
1767 /* 1767 /*
1768 * We are not draining per cpu cached charges during 1768 * We are not draining per cpu cached charges during
1769 * soft limit reclaim because global reclaim doesn't 1769 * soft limit reclaim because global reclaim doesn't
1770 * care about charges. It tries to free some memory and 1770 * care about charges. It tries to free some memory and
1771 * draining the cached charges will not give it any. 1771 * draining the cached charges will not give it any.
1772 */ 1772 */
1773 if (!check_soft && loop >= 1) 1773 if (!check_soft && loop >= 1)
1774 drain_all_stock_async(root_mem); 1774 drain_all_stock_async(root_mem);
1775 if (loop >= 2) { 1775 if (loop >= 2) {
1776 /* 1776 /*
1777 * If we have not been able to reclaim 1777 * If we have not been able to reclaim
1778 * anything, it might because there are 1778 * anything, it might because there are
1779 * no reclaimable pages under this hierarchy 1779 * no reclaimable pages under this hierarchy
1780 */ 1780 */
1781 if (!check_soft || !total) { 1781 if (!check_soft || !total) {
1782 css_put(&victim->css); 1782 css_put(&victim->css);
1783 break; 1783 break;
1784 } 1784 }
1785 /* 1785 /*
1786 * We want to do more targeted reclaim. 1786 * We want to do more targeted reclaim.
1787 * excess >> 2 is not so excessive as to 1787 * excess >> 2 is not so excessive as to
1788 * reclaim too much, nor so little that we keep 1788 * reclaim too much, nor so little that we keep
1789 * coming back to reclaim from this cgroup 1789 * coming back to reclaim from this cgroup
1790 */ 1790 */
1791 if (total >= (excess >> 2) || 1791 if (total >= (excess >> 2) ||
1792 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1792 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1793 css_put(&victim->css); 1793 css_put(&victim->css);
1794 break; 1794 break;
1795 } 1795 }
1796 } 1796 }
1797 } 1797 }
1798 if (!mem_cgroup_reclaimable(victim, noswap)) { 1798 if (!mem_cgroup_reclaimable(victim, noswap)) {
1799 /* this cgroup's local usage == 0 */ 1799 /* this cgroup's local usage == 0 */
1800 css_put(&victim->css); 1800 css_put(&victim->css);
1801 continue; 1801 continue;
1802 } 1802 }
1803 rec.mem = victim; 1803 rec.mem = victim;
1804 rec.nr_scanned[0] = 0; 1804 rec.nr_scanned[0] = 0;
1805 rec.nr_scanned[1] = 0; 1805 rec.nr_scanned[1] = 0;
1806 rec.nr_rotated[0] = 0; 1806 rec.nr_rotated[0] = 0;
1807 rec.nr_rotated[1] = 0; 1807 rec.nr_rotated[1] = 0;
1808 rec.nr_freed[0] = 0; 1808 rec.nr_freed[0] = 0;
1809 rec.nr_freed[1] = 0; 1809 rec.nr_freed[1] = 0;
1810 rec.elapsed = 0; 1810 rec.elapsed = 0;
1811 /* we use swappiness of local cgroup */ 1811 /* we use swappiness of local cgroup */
1812 if (check_soft) { 1812 if (check_soft) {
1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1814 noswap, zone, &rec, &scanned); 1814 noswap, zone, &rec, &scanned);
1815 *total_scanned += scanned; 1815 *total_scanned += scanned;
1816 } else 1816 } else
1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1818 noswap, &rec); 1818 noswap, &rec);
1819 mem_cgroup_record_scanstat(&rec); 1819 mem_cgroup_record_scanstat(&rec);
1820 css_put(&victim->css); 1820 css_put(&victim->css);
1821 /* 1821 /*
1822 * When shrinking usage, we can't check whether we should stop here or 1822 * When shrinking usage, we can't check whether we should stop here or
1823 * reclaim more. That depends on the callers. last_scanned_child 1823 * reclaim more. That depends on the callers. last_scanned_child
1824 * will work well enough for keeping fairness under the tree. 1824 * will work well enough for keeping fairness under the tree.
1825 */ 1825 */
1826 if (shrink) 1826 if (shrink)
1827 return ret; 1827 return ret;
1828 total += ret; 1828 total += ret;
1829 if (check_soft) { 1829 if (check_soft) {
1830 if (!res_counter_soft_limit_excess(&root_mem->res)) 1830 if (!res_counter_soft_limit_excess(&root_mem->res))
1831 return total; 1831 return total;
1832 } else if (mem_cgroup_margin(root_mem)) 1832 } else if (mem_cgroup_margin(root_mem))
1833 return total; 1833 return total;
1834 } 1834 }
1835 return total; 1835 return total;
1836 } 1836 }
1837 1837
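Of the several exit conditions in the walk above, the targeted-reclaim bail-out is the least obvious; the stand-alone sketch below models just that test (not part of this file; MAX_RECLAIM_LOOPS and the numbers are illustrative stand-ins, and the !check_soft/!total break is deliberately left out): once the walk has come back to the root at least twice, stop as soon as a quarter of the excess has been reclaimed or too many passes have been made.

#include <stdbool.h>
#include <stdio.h>

#define MAX_RECLAIM_LOOPS 100	/* stands in for MEM_CGROUP_MAX_RECLAIM_LOOPS */

/*
 * Model of the bail-out test applied each time the walk comes back to the
 * root (loop >= 2): stop once a quarter of the excess has been reclaimed,
 * or after too many passes over the hierarchy.
 */
static bool should_stop(unsigned long total, unsigned long excess, int loop)
{
	if (loop < 2)
		return false;
	return total >= (excess >> 2) || loop > MAX_RECLAIM_LOOPS;
}

int main(void)
{
	unsigned long excess = 1000;	/* pages over the (soft) limit */

	printf("%d %d %d\n",
	       should_stop(100, excess, 2),	/* 0: less than excess/4 reclaimed */
	       should_stop(250, excess, 2),	/* 1: a quarter reclaimed, good enough */
	       should_stop(0,   excess, 101));	/* 1: give up after many passes */
	return 0;
}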
1838 /* 1838 /*
1839 * Check whether the OOM-Killer is already running under our hierarchy. 1839 * Check whether the OOM-Killer is already running under our hierarchy.
1840 * If someone is running, return false. 1840 * If someone is running, return false.
1841 * Has to be called with memcg_oom_lock 1841 * Has to be called with memcg_oom_lock
1842 */ 1842 */
1843 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1843 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1844 { 1844 {
1845 int lock_count = -1; 1845 int lock_count = -1;
1846 struct mem_cgroup *iter, *failed = NULL; 1846 struct mem_cgroup *iter, *failed = NULL;
1847 bool cond = true; 1847 bool cond = true;
1848 1848
1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1850 bool locked = iter->oom_lock; 1850 bool locked = iter->oom_lock;
1851 1851
1852 iter->oom_lock = true; 1852 iter->oom_lock = true;
1853 if (lock_count == -1) 1853 if (lock_count == -1)
1854 lock_count = iter->oom_lock; 1854 lock_count = iter->oom_lock;
1855 else if (lock_count != locked) { 1855 else if (lock_count != locked) {
1856 /* 1856 /*
1857 * this subtree of our hierarchy is already locked 1857 * this subtree of our hierarchy is already locked
1858 * so we cannot give a lock. 1858 * so we cannot give a lock.
1859 */ 1859 */
1860 lock_count = 0; 1860 lock_count = 0;
1861 failed = iter; 1861 failed = iter;
1862 cond = false; 1862 cond = false;
1863 } 1863 }
1864 } 1864 }
1865 1865
1866 if (!failed) 1866 if (!failed)
1867 goto done; 1867 goto done;
1868 1868
1869 /* 1869 /*
1870 * OK, we failed to lock the whole subtree so we have to clean up 1870 * OK, we failed to lock the whole subtree so we have to clean up
1871 * what we set up to the failing subtree 1871 * what we set up to the failing subtree
1872 */ 1872 */
1873 cond = true; 1873 cond = true;
1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) {
1875 if (iter == failed) { 1875 if (iter == failed) {
1876 cond = false; 1876 cond = false;
1877 continue; 1877 continue;
1878 } 1878 }
1879 iter->oom_lock = false; 1879 iter->oom_lock = false;
1880 } 1880 }
1881 done: 1881 done:
1882 return lock_count; 1882 return lock_count;
1883 } 1883 }
1884 1884
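The lock/rollback walk above is easier to see on a flat array; this is a stand-alone model of its intent (user-space C, not part of this file; the array layout and names are invented, and the lock_count bookkeeping of the real function is simplified away): try to take the per-group flag across the whole subtree, and if some member already holds it, clear the flags taken so far and report failure.

#include <stdbool.h>
#include <stdio.h>

#define NR 4

/*
 * Model: locked[0..NR-1] are the subtree members in the order the iterator
 * visits them. Take every per-group flag; on the first one that is already
 * set, clear the ones taken so far and report failure.
 */
static bool oom_lock_subtree(bool locked[NR])
{
	int failed = -1;

	for (int i = 0; i < NR; i++) {
		if (locked[i]) {		/* someone in this subtree already holds it */
			failed = i;
			break;
		}
		locked[i] = true;
	}

	if (failed < 0)
		return true;

	for (int i = 0; i < failed; i++)	/* roll back what we set */
		locked[i] = false;
	return false;
}

int main(void)
{
	bool locked[NR] = { false, false, true, false };	/* member 2 is already locked */

	printf("got lock: %d\n", oom_lock_subtree(locked));	/* 0 */
	printf("flags: %d %d %d %d\n", locked[0], locked[1], locked[2], locked[3]);
	return 0;
}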
1885 /* 1885 /*
1886 * Has to be called with memcg_oom_lock 1886 * Has to be called with memcg_oom_lock
1887 */ 1887 */
1888 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1888 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1889 { 1889 {
1890 struct mem_cgroup *iter; 1890 struct mem_cgroup *iter;
1891 1891
1892 for_each_mem_cgroup_tree(iter, mem) 1892 for_each_mem_cgroup_tree(iter, mem)
1893 iter->oom_lock = false; 1893 iter->oom_lock = false;
1894 return 0; 1894 return 0;
1895 } 1895 }
1896 1896
1897 static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1897 static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem)
1898 { 1898 {
1899 struct mem_cgroup *iter; 1899 struct mem_cgroup *iter;
1900 1900
1901 for_each_mem_cgroup_tree(iter, mem) 1901 for_each_mem_cgroup_tree(iter, mem)
1902 atomic_inc(&iter->under_oom); 1902 atomic_inc(&iter->under_oom);
1903 } 1903 }
1904 1904
1905 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1905 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1906 { 1906 {
1907 struct mem_cgroup *iter; 1907 struct mem_cgroup *iter;
1908 1908
1909 /* 1909 /*
1910 * When a new child is created while the hierarchy is under oom, 1910 * When a new child is created while the hierarchy is under oom,
1911 * mem_cgroup_oom_lock() may not be called. We have to use 1911 * mem_cgroup_oom_lock() may not be called. We have to use
1912 * atomic_add_unless() here. 1912 * atomic_add_unless() here.
1913 */ 1913 */
1914 for_each_mem_cgroup_tree(iter, mem) 1914 for_each_mem_cgroup_tree(iter, mem)
1915 atomic_add_unless(&iter->under_oom, -1, 0); 1915 atomic_add_unless(&iter->under_oom, -1, 0);
1916 } 1916 }
1917 1917
1918 static DEFINE_SPINLOCK(memcg_oom_lock); 1918 static DEFINE_SPINLOCK(memcg_oom_lock);
1919 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1919 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1920 1920
1921 struct oom_wait_info { 1921 struct oom_wait_info {
1922 struct mem_cgroup *mem; 1922 struct mem_cgroup *mem;
1923 wait_queue_t wait; 1923 wait_queue_t wait;
1924 }; 1924 };
1925 1925
1926 static int memcg_oom_wake_function(wait_queue_t *wait, 1926 static int memcg_oom_wake_function(wait_queue_t *wait,
1927 unsigned mode, int sync, void *arg) 1927 unsigned mode, int sync, void *arg)
1928 { 1928 {
1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg,
1930 *oom_wait_mem; 1930 *oom_wait_mem;
1931 struct oom_wait_info *oom_wait_info; 1931 struct oom_wait_info *oom_wait_info;
1932 1932
1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1934 oom_wait_mem = oom_wait_info->mem; 1934 oom_wait_mem = oom_wait_info->mem;
1935 1935
1936 /* 1936 /*
1937 * Both oom_wait_info->mem and wake_mem are stable under us, 1937 * Both oom_wait_info->mem and wake_mem are stable under us,
1938 * so we can use css_is_ancestor() without worrying about RCU. 1938 * so we can use css_is_ancestor() without worrying about RCU.
1939 */ 1939 */
1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem)
1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem))
1942 return 0; 1942 return 0;
1943 return autoremove_wake_function(wait, mode, sync, arg); 1943 return autoremove_wake_function(wait, mode, sync, arg);
1944 } 1944 }
1945 1945
1946 static void memcg_wakeup_oom(struct mem_cgroup *mem) 1946 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1947 { 1947 {
1948 /* for filtering, pass "mem" as argument. */ 1948 /* for filtering, pass "mem" as argument. */
1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1950 } 1950 }
1951 1951
1952 static void memcg_oom_recover(struct mem_cgroup *mem) 1952 static void memcg_oom_recover(struct mem_cgroup *mem)
1953 { 1953 {
1954 if (mem && atomic_read(&mem->under_oom)) 1954 if (mem && atomic_read(&mem->under_oom))
1955 memcg_wakeup_oom(mem); 1955 memcg_wakeup_oom(mem);
1956 } 1956 }
1957 1957
1958 /* 1958 /*
1959 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. 1959 * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop.
1960 */ 1960 */
1961 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1961 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1962 { 1962 {
1963 struct oom_wait_info owait; 1963 struct oom_wait_info owait;
1964 bool locked, need_to_kill; 1964 bool locked, need_to_kill;
1965 1965
1966 owait.mem = mem; 1966 owait.mem = mem;
1967 owait.wait.flags = 0; 1967 owait.wait.flags = 0;
1968 owait.wait.func = memcg_oom_wake_function; 1968 owait.wait.func = memcg_oom_wake_function;
1969 owait.wait.private = current; 1969 owait.wait.private = current;
1970 INIT_LIST_HEAD(&owait.wait.task_list); 1970 INIT_LIST_HEAD(&owait.wait.task_list);
1971 need_to_kill = true; 1971 need_to_kill = true;
1972 mem_cgroup_mark_under_oom(mem); 1972 mem_cgroup_mark_under_oom(mem);
1973 1973
1974 /* At first, try to OOM lock hierarchy under mem.*/ 1974 /* At first, try to OOM lock hierarchy under mem.*/
1975 spin_lock(&memcg_oom_lock); 1975 spin_lock(&memcg_oom_lock);
1976 locked = mem_cgroup_oom_lock(mem); 1976 locked = mem_cgroup_oom_lock(mem);
1977 /* 1977 /*
1978 * Even if signal_pending(), we can't quit charge() loop without 1978 * Even if signal_pending(), we can't quit charge() loop without
1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1980 * under OOM is always welcomed, use TASK_KILLABLE here. 1980 * under OOM is always welcomed, use TASK_KILLABLE here.
1981 */ 1981 */
1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1983 if (!locked || mem->oom_kill_disable) 1983 if (!locked || mem->oom_kill_disable)
1984 need_to_kill = false; 1984 need_to_kill = false;
1985 if (locked) 1985 if (locked)
1986 mem_cgroup_oom_notify(mem); 1986 mem_cgroup_oom_notify(mem);
1987 spin_unlock(&memcg_oom_lock); 1987 spin_unlock(&memcg_oom_lock);
1988 1988
1989 if (need_to_kill) { 1989 if (need_to_kill) {
1990 finish_wait(&memcg_oom_waitq, &owait.wait); 1990 finish_wait(&memcg_oom_waitq, &owait.wait);
1991 mem_cgroup_out_of_memory(mem, mask); 1991 mem_cgroup_out_of_memory(mem, mask);
1992 } else { 1992 } else {
1993 schedule(); 1993 schedule();
1994 finish_wait(&memcg_oom_waitq, &owait.wait); 1994 finish_wait(&memcg_oom_waitq, &owait.wait);
1995 } 1995 }
1996 spin_lock(&memcg_oom_lock); 1996 spin_lock(&memcg_oom_lock);
1997 if (locked) 1997 if (locked)
1998 mem_cgroup_oom_unlock(mem); 1998 mem_cgroup_oom_unlock(mem);
1999 memcg_wakeup_oom(mem); 1999 memcg_wakeup_oom(mem);
2000 spin_unlock(&memcg_oom_lock); 2000 spin_unlock(&memcg_oom_lock);
2001 2001
2002 mem_cgroup_unmark_under_oom(mem); 2002 mem_cgroup_unmark_under_oom(mem);
2003 2003
2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2005 return false; 2005 return false;
2006 /* Give chance to dying process */ 2006 /* Give chance to dying process */
2007 schedule_timeout(1); 2007 schedule_timeout(1);
2008 return true; 2008 return true;
2009 } 2009 }
2010 2010
2011 /* 2011 /*
2012 * Currently used to update mapped file statistics, but the routine can be 2012 * Currently used to update mapped file statistics, but the routine can be
2013 * generalized to update other statistics as well. 2013 * generalized to update other statistics as well.
2014 * 2014 *
2015 * Notes: Race condition 2015 * Notes: Race condition
2016 * 2016 *
2017 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2017 * We usually use page_cgroup_lock() for accessing page_cgroup member but
2018 * it tends to be costly. But considering some conditions, we don't need 2018 * it tends to be costly. But considering some conditions, we don't need
2019 * to do so _always_. 2019 * to do so _always_.
2020 * 2020 *
2021 * Considering "charge", lock_page_cgroup() is not required because all 2021 * Considering "charge", lock_page_cgroup() is not required because all
2022 * file-stat operations happen after a page is attached to the radix-tree. There 2022 * file-stat operations happen after a page is attached to the radix-tree. There
2023 * is no race with "charge". 2023 * is no race with "charge".
2024 * 2024 *
2025 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2025 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
2026 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even 2026 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
2027 * if there is a race with "uncharge". The statistics themselves are properly 2027 * if there is a race with "uncharge". The statistics themselves are properly
2028 * handled by the flags. 2028 * handled by the flags.
2029 * 2029 *
2030 * Considering "move", this is the only case where we see a race. To make the 2030 * Considering "move", this is the only case where we see a race. To make the
2031 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect 2031 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
2032 * the possibility of a race condition. If there is one, we take a lock. 2032 * the possibility of a race condition. If there is one, we take a lock.
2033 */ 2033 */
2034 2034
2035 void mem_cgroup_update_page_stat(struct page *page, 2035 void mem_cgroup_update_page_stat(struct page *page,
2036 enum mem_cgroup_page_stat_item idx, int val) 2036 enum mem_cgroup_page_stat_item idx, int val)
2037 { 2037 {
2038 struct mem_cgroup *mem; 2038 struct mem_cgroup *mem;
2039 struct page_cgroup *pc = lookup_page_cgroup(page); 2039 struct page_cgroup *pc = lookup_page_cgroup(page);
2040 bool need_unlock = false; 2040 bool need_unlock = false;
2041 unsigned long uninitialized_var(flags); 2041 unsigned long uninitialized_var(flags);
2042 2042
2043 if (unlikely(!pc)) 2043 if (unlikely(!pc))
2044 return; 2044 return;
2045 2045
2046 rcu_read_lock(); 2046 rcu_read_lock();
2047 mem = pc->mem_cgroup; 2047 mem = pc->mem_cgroup;
2048 if (unlikely(!mem || !PageCgroupUsed(pc))) 2048 if (unlikely(!mem || !PageCgroupUsed(pc)))
2049 goto out; 2049 goto out;
2050 /* pc->mem_cgroup is unstable ? */ 2050 /* pc->mem_cgroup is unstable ? */
2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
2052 /* take a lock so that we can access pc->mem_cgroup safely */ 2052 /* take a lock so that we can access pc->mem_cgroup safely */
2053 move_lock_page_cgroup(pc, &flags); 2053 move_lock_page_cgroup(pc, &flags);
2054 need_unlock = true; 2054 need_unlock = true;
2055 mem = pc->mem_cgroup; 2055 mem = pc->mem_cgroup;
2056 if (!mem || !PageCgroupUsed(pc)) 2056 if (!mem || !PageCgroupUsed(pc))
2057 goto out; 2057 goto out;
2058 } 2058 }
2059 2059
2060 switch (idx) { 2060 switch (idx) {
2061 case MEMCG_NR_FILE_MAPPED: 2061 case MEMCG_NR_FILE_MAPPED:
2062 if (val > 0) 2062 if (val > 0)
2063 SetPageCgroupFileMapped(pc); 2063 SetPageCgroupFileMapped(pc);
2064 else if (!page_mapped(page)) 2064 else if (!page_mapped(page))
2065 ClearPageCgroupFileMapped(pc); 2065 ClearPageCgroupFileMapped(pc);
2066 idx = MEM_CGROUP_STAT_FILE_MAPPED; 2066 idx = MEM_CGROUP_STAT_FILE_MAPPED;
2067 break; 2067 break;
2068 default: 2068 default:
2069 BUG(); 2069 BUG();
2070 } 2070 }
2071 2071
2072 this_cpu_add(mem->stat->count[idx], val); 2072 this_cpu_add(mem->stat->count[idx], val);
2073 2073
2074 out: 2074 out:
2075 if (unlikely(need_unlock)) 2075 if (unlikely(need_unlock))
2076 move_unlock_page_cgroup(pc, &flags); 2076 move_unlock_page_cgroup(pc, &flags);
2077 rcu_read_unlock(); 2077 rcu_read_unlock();
2078 return; 2078 return;
2079 } 2079 }
2080 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 2080 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
2081 2081
2082 /* 2082 /*
2083 * Size of the first charge trial. "32" comes from vmscan.c's magic value. 2083 * Size of the first charge trial. "32" comes from vmscan.c's magic value.
2084 * TODO: it may be necessary to use bigger numbers on big iron. 2084 * TODO: it may be necessary to use bigger numbers on big iron.
2085 */ 2085 */
2086 #define CHARGE_BATCH 32U 2086 #define CHARGE_BATCH 32U
2087 struct memcg_stock_pcp { 2087 struct memcg_stock_pcp {
2088 struct mem_cgroup *cached; /* this is never the root cgroup */ 2088 struct mem_cgroup *cached; /* this is never the root cgroup */
2089 unsigned int nr_pages; 2089 unsigned int nr_pages;
2090 struct work_struct work; 2090 struct work_struct work;
2091 unsigned long flags; 2091 unsigned long flags;
2092 #define FLUSHING_CACHED_CHARGE (0) 2092 #define FLUSHING_CACHED_CHARGE (0)
2093 }; 2093 };
2094 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2094 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2095 static DEFINE_MUTEX(percpu_charge_mutex);
2096 2095
2097 /* 2096 /*
2098 * Try to consume stocked charge on this cpu. On success, one page is consumed 2097 * Try to consume stocked charge on this cpu. On success, one page is consumed
2099 * from the local stock and true is returned. If the stock is 0 or holds charges 2098 * from the local stock and true is returned. If the stock is 0 or holds charges
2100 * from a cgroup which is not the current target, false is returned. This stock 2099 * from a cgroup which is not the current target, false is returned. This stock
2101 * will be refilled. 2100 * will be refilled.
2102 */ 2101 */
2103 static bool consume_stock(struct mem_cgroup *mem) 2102 static bool consume_stock(struct mem_cgroup *mem)
2104 { 2103 {
2105 struct memcg_stock_pcp *stock; 2104 struct memcg_stock_pcp *stock;
2106 bool ret = true; 2105 bool ret = true;
2107 2106
2108 stock = &get_cpu_var(memcg_stock); 2107 stock = &get_cpu_var(memcg_stock);
2109 if (mem == stock->cached && stock->nr_pages) 2108 if (mem == stock->cached && stock->nr_pages)
2110 stock->nr_pages--; 2109 stock->nr_pages--;
2111 else /* need to call res_counter_charge */ 2110 else /* need to call res_counter_charge */
2112 ret = false; 2111 ret = false;
2113 put_cpu_var(memcg_stock); 2112 put_cpu_var(memcg_stock);
2114 return ret; 2113 return ret;
2115 } 2114 }
2116 2115
2117 /* 2116 /*
2118 * Return the stock cached in the percpu area to the res_counter and reset the cached information. 2117 * Return the stock cached in the percpu area to the res_counter and reset the cached information.
2119 */ 2118 */
2120 static void drain_stock(struct memcg_stock_pcp *stock) 2119 static void drain_stock(struct memcg_stock_pcp *stock)
2121 { 2120 {
2122 struct mem_cgroup *old = stock->cached; 2121 struct mem_cgroup *old = stock->cached;
2123 2122
2124 if (stock->nr_pages) { 2123 if (stock->nr_pages) {
2125 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2124 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2126 2125
2127 res_counter_uncharge(&old->res, bytes); 2126 res_counter_uncharge(&old->res, bytes);
2128 if (do_swap_account) 2127 if (do_swap_account)
2129 res_counter_uncharge(&old->memsw, bytes); 2128 res_counter_uncharge(&old->memsw, bytes);
2130 stock->nr_pages = 0; 2129 stock->nr_pages = 0;
2131 } 2130 }
2132 stock->cached = NULL; 2131 stock->cached = NULL;
2133 } 2132 }
2134 2133
2135 /* 2134 /*
2136 * This must be called with preemption disabled, or by 2135 * This must be called with preemption disabled, or by
2137 * a thread which is pinned to the local cpu. 2136 * a thread which is pinned to the local cpu.
2138 */ 2137 */
2139 static void drain_local_stock(struct work_struct *dummy) 2138 static void drain_local_stock(struct work_struct *dummy)
2140 { 2139 {
2141 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2140 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
2142 drain_stock(stock); 2141 drain_stock(stock);
2143 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2142 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2144 } 2143 }
2145 2144
2146 /* 2145 /*
2147 * Cache charges (nr_pages) obtained from the res_counter in the local per-cpu area. 2146 * Cache charges (nr_pages) obtained from the res_counter in the local per-cpu area.
2148 * They will be consumed by the consume_stock() function later. 2147 * They will be consumed by the consume_stock() function later.
2149 */ 2148 */
2150 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2149 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
2151 { 2150 {
2152 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2151 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2153 2152
2154 if (stock->cached != mem) { /* reset if necessary */ 2153 if (stock->cached != mem) { /* reset if necessary */
2155 drain_stock(stock); 2154 drain_stock(stock);
2156 stock->cached = mem; 2155 stock->cached = mem;
2157 } 2156 }
2158 stock->nr_pages += nr_pages; 2157 stock->nr_pages += nr_pages;
2159 put_cpu_var(memcg_stock); 2158 put_cpu_var(memcg_stock);
2160 } 2159 }
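consume_stock(), drain_stock() and refill_stock() together form a small per-cpu cache of pre-charged pages sitting in front of the res_counter. The following is a rough, single-threaded userspace model of that flow, with a hypothetical struct counter standing in for res_counter; it is an illustration of the idea, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* stand-in for res_counter: pages charged against the group's limit */
struct counter { unsigned long charged; };

/* stand-in for one cpu's memcg_stock_pcp */
struct stock {
	struct counter *cached;		/* group the stock belongs to */
	unsigned int nr_pages;		/* pre-charged pages available */
};

/* consume one pre-charged page if the stock matches the target group */
static bool consume_stock(struct stock *s, struct counter *target)
{
	if (s->cached == target && s->nr_pages) {
		s->nr_pages--;
		return true;
	}
	return false;		/* caller must charge the counter directly */
}

/* give unused pre-charged pages back to the counter */
static void drain_stock(struct stock *s)
{
	if (s->cached && s->nr_pages) {
		s->cached->charged -= s->nr_pages;
		s->nr_pages = 0;
	}
	s->cached = NULL;
}

/* cache pages charged in one batch for later consume_stock() calls */
static void refill_stock(struct stock *s, struct counter *c, unsigned int nr)
{
	if (s->cached != c)	/* stock held another group: return it first */
		drain_stock(s);
	s->cached = c;
	s->nr_pages += nr;
}

int main(void)
{
	struct counter memcg = { 0 };
	struct stock s = { 0 };

	memcg.charged += 32;		/* batched charge, as with CHARGE_BATCH */
	refill_stock(&s, &memcg, 31);	/* keep 31 spare pages on this "cpu" */
	printf("consume: %d\n", consume_stock(&s, &memcg));	/* prints 1 */
	drain_stock(&s);
	printf("charged after drain: %lu\n", memcg.charged);	/* prints 2 */
	return 0;
}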
2161 2160
2162 /* 2161 /*
2163 * Drains all per-CPU charge caches for the given root_mem, i.e. the subtree 2162 * Drains all per-CPU charge caches for the given root_mem, i.e. the subtree
2164 * of the hierarchy under it. The sync flag says whether we should block 2163 * of the hierarchy under it. The sync flag says whether we should block
2165 * until the work is done. 2164 * until the work is done.
2166 */ 2165 */
2167 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2166 static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2168 { 2167 {
2169 int cpu, curcpu; 2168 int cpu, curcpu;
2170 2169
2171 /* Notify other cpus that system-wide "drain" is running */ 2170 /* Notify other cpus that system-wide "drain" is running */
2172 get_online_cpus(); 2171 get_online_cpus();
2173 /* 2172 /*
2174 * Get a hint for avoiding draining charges on the current cpu, 2173 * Get a hint for avoiding draining charges on the current cpu,
2175 * which must be exhausted by our charging. It is not required that 2174 * which must be exhausted by our charging. It is not required that
2176 * this be a precise check, so we use raw_smp_processor_id() instead of 2175 * this be a precise check, so we use raw_smp_processor_id() instead of
2177 * get_cpu()/put_cpu(). 2176 * get_cpu()/put_cpu().
2178 */ 2177 */
2179 curcpu = raw_smp_processor_id(); 2178 curcpu = raw_smp_processor_id();
2180 for_each_online_cpu(cpu) { 2179 for_each_online_cpu(cpu) {
2181 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2180 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2182 struct mem_cgroup *mem; 2181 struct mem_cgroup *mem;
2183 2182
2184 mem = stock->cached; 2183 mem = stock->cached;
2185 if (!mem || !stock->nr_pages) 2184 if (!mem || !stock->nr_pages)
2186 continue; 2185 continue;
2187 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2186 if (!mem_cgroup_same_or_subtree(root_mem, mem))
2188 continue; 2187 continue;
2189 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2188 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2190 if (cpu == curcpu) 2189 if (cpu == curcpu)
2191 drain_local_stock(&stock->work); 2190 drain_local_stock(&stock->work);
2192 else 2191 else
2193 schedule_work_on(cpu, &stock->work); 2192 schedule_work_on(cpu, &stock->work);
2194 } 2193 }
2195 } 2194 }
2196 2195
2197 if (!sync) 2196 if (!sync)
2198 goto out; 2197 goto out;
2199 2198
2200 for_each_online_cpu(cpu) { 2199 for_each_online_cpu(cpu) {
2201 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2200 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2202 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2201 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) &&
2202 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2203 flush_work(&stock->work); 2203 flush_work(&stock->work);
2204 } 2204 }
2205 out: 2205 out:
2206 put_online_cpus(); 2206 put_online_cpus();
2207 } 2207 }
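With percpu_charge_mutex gone, the only things preventing redundant drain work are the nr_pages and subtree checks above plus the atomic test-and-set of FLUSHING_CACHED_CHARGE. The sketch below models that guard with C11 atomics in userspace, to show that at most one work item is queued per cpu even when draining is requested repeatedly; the names and the scheduling stub are illustrative assumptions, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

/* per-"cpu" stock; the atomic flag models FLUSHING_CACHED_CHARGE */
struct stock {
	unsigned int nr_pages;
	atomic_bool flushing;
};

static struct stock stocks[NCPUS];
static int works_scheduled;

/* stand-in for schedule_work_on(): just count what would be queued */
static void schedule_drain(int cpu)
{
	(void)cpu;
	works_scheduled++;
}

/*
 * Ask every cpu with a non-empty stock to drain. The atomic exchange on
 * the flushing flag plays the role of test_and_set_bit(): at most one
 * drain work item can be pending per cpu, no matter how many callers
 * request a drain, which is why no global mutex is needed.
 */
static void drain_all(void)
{
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		struct stock *s = &stocks[cpu];

		if (!s->nr_pages)
			continue;	/* nothing cached, don't queue work */
		if (!atomic_exchange(&s->flushing, true))
			schedule_drain(cpu);
	}
}

/* what the per-cpu work would do: drain the stock, then clear the flag */
static void drain_local(int cpu)
{
	stocks[cpu].nr_pages = 0;
	atomic_store(&stocks[cpu].flushing, false);
}

int main(void)
{
	stocks[1].nr_pages = 5;
	stocks[3].nr_pages = 7;

	drain_all();
	drain_all();	/* second request: the flags are already set */
	printf("works scheduled: %d\n", works_scheduled);	/* prints 2, not 4 */

	drain_local(1);
	drain_local(3);
	return 0;
}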
2208 2208
2209 /* 2209 /*
2210 * Tries to drain stocked charges on other cpus. This function is asynchronous 2210 * Tries to drain stocked charges on other cpus. This function is asynchronous
2211 * and just puts a work item per cpu for draining locally on each cpu. The 2211 * and just puts a work item per cpu for draining locally on each cpu. The
2212 * caller can expect some charges to be returned to the res_counter later, but 2212 * caller can expect some charges to be returned to the res_counter later, but
2213 * cannot wait for it. 2213 * cannot wait for it.
2214 */ 2214 */
2215 static void drain_all_stock_async(struct mem_cgroup *root_mem) 2215 static void drain_all_stock_async(struct mem_cgroup *root_mem)
2216 { 2216 {
2217 /*
2218 * If someone calls draining, avoid adding more kworker runs.
2219 */
2220 if (!mutex_trylock(&percpu_charge_mutex))
2221 return;
2222 drain_all_stock(root_mem, false); 2217 drain_all_stock(root_mem, false);
2223 mutex_unlock(&percpu_charge_mutex);
2224 } 2218 }
2225 2219
2226 /* This is a synchronous drain interface. */ 2220 /* This is a synchronous drain interface. */
2227 static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2221 static void drain_all_stock_sync(struct mem_cgroup *root_mem)
2228 { 2222 {
2229 /* called when force_empty is called */ 2223 /* called when force_empty is called */
2230 mutex_lock(&percpu_charge_mutex);
2231 drain_all_stock(root_mem, true); 2224 drain_all_stock(root_mem, true);
2232 mutex_unlock(&percpu_charge_mutex);
2233 } 2225 }
2234 2226
2235 /* 2227 /*
2236 * This function drains the percpu counter values from a DEAD cpu and 2228 * This function drains the percpu counter values from a DEAD cpu and
2237 * moves them to the local cpu. Note that this function can be preempted. 2229 * moves them to the local cpu. Note that this function can be preempted.
2238 */ 2230 */
2239 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2231 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2240 { 2232 {
2241 int i; 2233 int i;
2242 2234
2243 spin_lock(&mem->pcp_counter_lock); 2235 spin_lock(&mem->pcp_counter_lock);
2244 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2236 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2245 long x = per_cpu(mem->stat->count[i], cpu); 2237 long x = per_cpu(mem->stat->count[i], cpu);
2246 2238
2247 per_cpu(mem->stat->count[i], cpu) = 0; 2239 per_cpu(mem->stat->count[i], cpu) = 0;
2248 mem->nocpu_base.count[i] += x; 2240 mem->nocpu_base.count[i] += x;
2249 } 2241 }
2250 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2242 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2251 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2243 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2252 2244
2253 per_cpu(mem->stat->events[i], cpu) = 0; 2245 per_cpu(mem->stat->events[i], cpu) = 0;
2254 mem->nocpu_base.events[i] += x; 2246 mem->nocpu_base.events[i] += x;
2255 } 2247 }
2256 /* need to clear ON_MOVE value, works as a kind of lock. */ 2248 /* need to clear ON_MOVE value, works as a kind of lock. */
2257 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2249 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2258 spin_unlock(&mem->pcp_counter_lock); 2250 spin_unlock(&mem->pcp_counter_lock);
2259 } 2251 }
2260 2252
2261 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2253 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2262 { 2254 {
2263 int idx = MEM_CGROUP_ON_MOVE; 2255 int idx = MEM_CGROUP_ON_MOVE;
2264 2256
2265 spin_lock(&mem->pcp_counter_lock); 2257 spin_lock(&mem->pcp_counter_lock);
2266 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2258 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2267 spin_unlock(&mem->pcp_counter_lock); 2259 spin_unlock(&mem->pcp_counter_lock);
2268 } 2260 }
2269 2261
2270 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2262 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2271 unsigned long action, 2263 unsigned long action,
2272 void *hcpu) 2264 void *hcpu)
2273 { 2265 {
2274 int cpu = (unsigned long)hcpu; 2266 int cpu = (unsigned long)hcpu;
2275 struct memcg_stock_pcp *stock; 2267 struct memcg_stock_pcp *stock;
2276 struct mem_cgroup *iter; 2268 struct mem_cgroup *iter;
2277 2269
2278 if ((action == CPU_ONLINE)) { 2270 if ((action == CPU_ONLINE)) {
2279 for_each_mem_cgroup_all(iter) 2271 for_each_mem_cgroup_all(iter)
2280 synchronize_mem_cgroup_on_move(iter, cpu); 2272 synchronize_mem_cgroup_on_move(iter, cpu);
2281 return NOTIFY_OK; 2273 return NOTIFY_OK;
2282 } 2274 }
2283 2275
2284 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2276 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
2285 return NOTIFY_OK; 2277 return NOTIFY_OK;
2286 2278
2287 for_each_mem_cgroup_all(iter) 2279 for_each_mem_cgroup_all(iter)
2288 mem_cgroup_drain_pcp_counter(iter, cpu); 2280 mem_cgroup_drain_pcp_counter(iter, cpu);
2289 2281
2290 stock = &per_cpu(memcg_stock, cpu); 2282 stock = &per_cpu(memcg_stock, cpu);
2291 drain_stock(stock); 2283 drain_stock(stock);
2292 return NOTIFY_OK; 2284 return NOTIFY_OK;
2293 } 2285 }
2294 2286
2295 2287
2296 /* See __mem_cgroup_try_charge() for details */ 2288 /* See __mem_cgroup_try_charge() for details */
2297 enum { 2289 enum {
2298 CHARGE_OK, /* success */ 2290 CHARGE_OK, /* success */
2299 CHARGE_RETRY, /* need to retry but retry is not bad */ 2291 CHARGE_RETRY, /* need to retry but retry is not bad */
2300 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2292 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
2301 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */ 2293 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and not enough res. */
2302 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2294 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2303 }; 2295 };
2304 2296
2305 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2297 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2306 unsigned int nr_pages, bool oom_check) 2298 unsigned int nr_pages, bool oom_check)
2307 { 2299 {
2308 unsigned long csize = nr_pages * PAGE_SIZE; 2300 unsigned long csize = nr_pages * PAGE_SIZE;
2309 struct mem_cgroup *mem_over_limit; 2301 struct mem_cgroup *mem_over_limit;
2310 struct res_counter *fail_res; 2302 struct res_counter *fail_res;
2311 unsigned long flags = 0; 2303 unsigned long flags = 0;
2312 int ret; 2304 int ret;
2313 2305
2314 ret = res_counter_charge(&mem->res, csize, &fail_res); 2306 ret = res_counter_charge(&mem->res, csize, &fail_res);
2315 2307
2316 if (likely(!ret)) { 2308 if (likely(!ret)) {
2317 if (!do_swap_account) 2309 if (!do_swap_account)
2318 return CHARGE_OK; 2310 return CHARGE_OK;
2319 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2311 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
2320 if (likely(!ret)) 2312 if (likely(!ret))
2321 return CHARGE_OK; 2313 return CHARGE_OK;
2322 2314
2323 res_counter_uncharge(&mem->res, csize); 2315 res_counter_uncharge(&mem->res, csize);
2324 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2316 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2325 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2317 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2326 } else 2318 } else
2327 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2319 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
2328 /* 2320 /*
2329 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 2321 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2330 * of regular pages (CHARGE_BATCH), or a single regular page (1). 2322 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2331 * 2323 *
2332 * Never reclaim on behalf of optional batching, retry with a 2324 * Never reclaim on behalf of optional batching, retry with a
2333 * single page instead. 2325 * single page instead.
2334 */ 2326 */
2335 if (nr_pages == CHARGE_BATCH) 2327 if (nr_pages == CHARGE_BATCH)
2336 return CHARGE_RETRY; 2328 return CHARGE_RETRY;
2337 2329
2338 if (!(gfp_mask & __GFP_WAIT)) 2330 if (!(gfp_mask & __GFP_WAIT))
2339 return CHARGE_WOULDBLOCK; 2331 return CHARGE_WOULDBLOCK;
2340 2332
2341 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2333 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
2342 gfp_mask, flags, NULL); 2334 gfp_mask, flags, NULL);
2343 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2335 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2344 return CHARGE_RETRY; 2336 return CHARGE_RETRY;
2345 /* 2337 /*
2346 * Even though the limit is exceeded at this point, reclaim 2338 * Even though the limit is exceeded at this point, reclaim
2347 * may have been able to free some pages. Retry the charge 2339 * may have been able to free some pages. Retry the charge
2348 * before killing the task. 2340 * before killing the task.
2349 * 2341 *
2350 * Only for regular pages, though: huge pages are rather 2342 * Only for regular pages, though: huge pages are rather
2351 * unlikely to succeed so close to the limit, and we fall back 2343 * unlikely to succeed so close to the limit, and we fall back
2352 * to regular pages anyway in case of failure. 2344 * to regular pages anyway in case of failure.
2353 */ 2345 */
2354 if (nr_pages == 1 && ret) 2346 if (nr_pages == 1 && ret)
2355 return CHARGE_RETRY; 2347 return CHARGE_RETRY;
2356 2348
2357 /* 2349 /*
2358 * At task move, charge accounts can be doubly counted. So, it's 2350 * At task move, charge accounts can be doubly counted. So, it's
2359 * better to wait until the end of task_move if something is going on. 2351 * better to wait until the end of task_move if something is going on.
2360 */ 2352 */
2361 if (mem_cgroup_wait_acct_move(mem_over_limit)) 2353 if (mem_cgroup_wait_acct_move(mem_over_limit))
2362 return CHARGE_RETRY; 2354 return CHARGE_RETRY;
2363 2355
2364 /* If we don't need to call the oom-killer at all, return immediately */ 2356 /* If we don't need to call the oom-killer at all, return immediately */
2365 if (!oom_check) 2357 if (!oom_check)
2366 return CHARGE_NOMEM; 2358 return CHARGE_NOMEM;
2367 /* check OOM */ 2359 /* check OOM */
2368 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 2360 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
2369 return CHARGE_OOM_DIE; 2361 return CHARGE_OOM_DIE;
2370 2362
2371 return CHARGE_RETRY; 2363 return CHARGE_RETRY;
2372 } 2364 }
2373 2365
2374 /* 2366 /*
2375 * Unlike exported interface, "oom" parameter is added. if oom==true, 2367 * Unlike exported interface, "oom" parameter is added. if oom==true,
2376 * oom-killer can be invoked. 2368 * oom-killer can be invoked.
2377 */ 2369 */
2378 static int __mem_cgroup_try_charge(struct mm_struct *mm, 2370 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2379 gfp_t gfp_mask, 2371 gfp_t gfp_mask,
2380 unsigned int nr_pages, 2372 unsigned int nr_pages,
2381 struct mem_cgroup **memcg, 2373 struct mem_cgroup **memcg,
2382 bool oom) 2374 bool oom)
2383 { 2375 {
2384 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2376 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2385 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2377 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2386 struct mem_cgroup *mem = NULL; 2378 struct mem_cgroup *mem = NULL;
2387 int ret; 2379 int ret;
2388 2380
2389 /* 2381 /*
2390 * Unlike the global VM's OOM-kill, we're not in a system-level memory 2382 * Unlike the global VM's OOM-kill, we're not in a system-level memory
2391 * shortage. So, allow a dying process to go ahead, in addition to 2383 * shortage. So, allow a dying process to go ahead, in addition to
2392 * MEMDIE processes. 2384 * MEMDIE processes.
2393 */ 2385 */
2394 if (unlikely(test_thread_flag(TIF_MEMDIE) 2386 if (unlikely(test_thread_flag(TIF_MEMDIE)
2395 || fatal_signal_pending(current))) 2387 || fatal_signal_pending(current)))
2396 goto bypass; 2388 goto bypass;
2397 2389
2398 /* 2390 /*
2399 * We always charge the cgroup the mm_struct belongs to. 2391 * We always charge the cgroup the mm_struct belongs to.
2400 * The mm_struct's mem_cgroup changes on task migration if the 2392 * The mm_struct's mem_cgroup changes on task migration if the
2401 * thread group leader migrates. It's possible that mm is not 2393 * thread group leader migrates. It's possible that mm is not
2402 * set, if so charge the init_mm (happens for pagecache usage). 2394 * set, if so charge the init_mm (happens for pagecache usage).
2403 */ 2395 */
2404 if (!*memcg && !mm) 2396 if (!*memcg && !mm)
2405 goto bypass; 2397 goto bypass;
2406 again: 2398 again:
2407 if (*memcg) { /* css should be a valid one */ 2399 if (*memcg) { /* css should be a valid one */
2408 mem = *memcg; 2400 mem = *memcg;
2409 VM_BUG_ON(css_is_removed(&mem->css)); 2401 VM_BUG_ON(css_is_removed(&mem->css));
2410 if (mem_cgroup_is_root(mem)) 2402 if (mem_cgroup_is_root(mem))
2411 goto done; 2403 goto done;
2412 if (nr_pages == 1 && consume_stock(mem)) 2404 if (nr_pages == 1 && consume_stock(mem))
2413 goto done; 2405 goto done;
2414 css_get(&mem->css); 2406 css_get(&mem->css);
2415 } else { 2407 } else {
2416 struct task_struct *p; 2408 struct task_struct *p;
2417 2409
2418 rcu_read_lock(); 2410 rcu_read_lock();
2419 p = rcu_dereference(mm->owner); 2411 p = rcu_dereference(mm->owner);
2420 /* 2412 /*
2421 * Because we don't have task_lock(), "p" can exit. 2413 * Because we don't have task_lock(), "p" can exit.
2422 * In that case, "mem" can point to root or p can be NULL due to a 2414 * In that case, "mem" can point to root or p can be NULL due to a
2423 * race with swapoff. Then, we have a small risk of mis-accounting. 2415 * race with swapoff. Then, we have a small risk of mis-accounting.
2424 * But this kind of mis-accounting by race always happens because 2416 * But this kind of mis-accounting by race always happens because
2425 * we don't have cgroup_mutex(). Taking it would be overkill, so we allow that 2417 * we don't have cgroup_mutex(). Taking it would be overkill, so we allow that
2426 * small race, here. 2418 * small race, here.
2427 * (*) swapoff et al. will charge against the mm_struct, not against 2419 * (*) swapoff et al. will charge against the mm_struct, not against
2428 * the task_struct. So, mm->owner can be NULL. 2420 * the task_struct. So, mm->owner can be NULL.
2429 */ 2421 */
2430 mem = mem_cgroup_from_task(p); 2422 mem = mem_cgroup_from_task(p);
2431 if (!mem || mem_cgroup_is_root(mem)) { 2423 if (!mem || mem_cgroup_is_root(mem)) {
2432 rcu_read_unlock(); 2424 rcu_read_unlock();
2433 goto done; 2425 goto done;
2434 } 2426 }
2435 if (nr_pages == 1 && consume_stock(mem)) { 2427 if (nr_pages == 1 && consume_stock(mem)) {
2436 /* 2428 /*
2437 * It seems dangerous to access memcg without css_get(). 2429 * It seems dangerous to access memcg without css_get().
2438 * But considering how consume_stock works, it's not 2430 * But considering how consume_stock works, it's not
2439 * necessary. If consume_stock succeeds, some charges 2431 * necessary. If consume_stock succeeds, some charges
2440 * from this memcg are cached on this cpu. So, we 2432 * from this memcg are cached on this cpu. So, we
2441 * don't need to call css_get()/css_tryget() before 2433 * don't need to call css_get()/css_tryget() before
2442 * calling consume_stock(). 2434 * calling consume_stock().
2443 */ 2435 */
2444 rcu_read_unlock(); 2436 rcu_read_unlock();
2445 goto done; 2437 goto done;
2446 } 2438 }
2447 /* after here, we may be blocked. we need to get refcnt */ 2439 /* after here, we may be blocked. we need to get refcnt */
2448 if (!css_tryget(&mem->css)) { 2440 if (!css_tryget(&mem->css)) {
2449 rcu_read_unlock(); 2441 rcu_read_unlock();
2450 goto again; 2442 goto again;
2451 } 2443 }
2452 rcu_read_unlock(); 2444 rcu_read_unlock();
2453 } 2445 }
2454 2446
2455 do { 2447 do {
2456 bool oom_check; 2448 bool oom_check;
2457 2449
2458 /* If killed, bypass charge */ 2450 /* If killed, bypass charge */
2459 if (fatal_signal_pending(current)) { 2451 if (fatal_signal_pending(current)) {
2460 css_put(&mem->css); 2452 css_put(&mem->css);
2461 goto bypass; 2453 goto bypass;
2462 } 2454 }
2463 2455
2464 oom_check = false; 2456 oom_check = false;
2465 if (oom && !nr_oom_retries) { 2457 if (oom && !nr_oom_retries) {
2466 oom_check = true; 2458 oom_check = true;
2467 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2459 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2468 } 2460 }
2469 2461
2470 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2462 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2471 switch (ret) { 2463 switch (ret) {
2472 case CHARGE_OK: 2464 case CHARGE_OK:
2473 break; 2465 break;
2474 case CHARGE_RETRY: /* not in OOM situation but retry */ 2466 case CHARGE_RETRY: /* not in OOM situation but retry */
2475 batch = nr_pages; 2467 batch = nr_pages;
2476 css_put(&mem->css); 2468 css_put(&mem->css);
2477 mem = NULL; 2469 mem = NULL;
2478 goto again; 2470 goto again;
2479 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2471 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2480 css_put(&mem->css); 2472 css_put(&mem->css);
2481 goto nomem; 2473 goto nomem;
2482 case CHARGE_NOMEM: /* OOM routine works */ 2474 case CHARGE_NOMEM: /* OOM routine works */
2483 if (!oom) { 2475 if (!oom) {
2484 css_put(&mem->css); 2476 css_put(&mem->css);
2485 goto nomem; 2477 goto nomem;
2486 } 2478 }
2487 /* If oom, we never return -ENOMEM */ 2479 /* If oom, we never return -ENOMEM */
2488 nr_oom_retries--; 2480 nr_oom_retries--;
2489 break; 2481 break;
2490 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2482 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2491 css_put(&mem->css); 2483 css_put(&mem->css);
2492 goto bypass; 2484 goto bypass;
2493 } 2485 }
2494 } while (ret != CHARGE_OK); 2486 } while (ret != CHARGE_OK);
2495 2487
2496 if (batch > nr_pages) 2488 if (batch > nr_pages)
2497 refill_stock(mem, batch - nr_pages); 2489 refill_stock(mem, batch - nr_pages);
2498 css_put(&mem->css); 2490 css_put(&mem->css);
2499 done: 2491 done:
2500 *memcg = mem; 2492 *memcg = mem;
2501 return 0; 2493 return 0;
2502 nomem: 2494 nomem:
2503 *memcg = NULL; 2495 *memcg = NULL;
2504 return -ENOMEM; 2496 return -ENOMEM;
2505 bypass: 2497 bypass:
2506 *memcg = NULL; 2498 *memcg = NULL;
2507 return 0; 2499 return 0;
2508 } 2500 }
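__mem_cgroup_try_charge() is essentially a retry loop around mem_cgroup_do_charge(), driven by the CHARGE_* codes, that first asks for a whole batch and then parks the surplus in the per-cpu stock. Below is a condensed userspace sketch of that control flow; the do_charge() stub just plays back canned results and is not the kernel function, and the names are invented for the sketch.

#include <stdio.h>

enum charge_res { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM, CHARGE_WOULDBLOCK };

/* canned results standing in for res_counter_charge()/reclaim outcomes */
static const enum charge_res script[] = { CHARGE_RETRY, CHARGE_OK, CHARGE_OK };
static int step;

static enum charge_res do_charge(unsigned int batch)
{
	printf("do_charge(batch=%u) -> %d\n", batch, script[step]);
	return script[step++];
}

/*
 * Charge nr_pages, optimistically asking for a whole batch first so the
 * surplus can be cached per-cpu; fall back to charging exactly nr_pages
 * when the batch does not fit (CHARGE_RETRY), and give up on the other
 * error codes -- the shape of __mem_cgroup_try_charge()'s do/while loop.
 */
static int try_charge(unsigned int nr_pages, unsigned int batch)
{
	enum charge_res ret;

	do {
		ret = do_charge(batch);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:	/* retry, but only for what we need */
			batch = nr_pages;
			continue;
		case CHARGE_WOULDBLOCK:
		case CHARGE_NOMEM:
			return -1;
		}
	} while (ret != CHARGE_OK);

	if (batch > nr_pages)
		printf("refill stock with %u spare pages\n", batch - nr_pages);
	return 0;
}

int main(void)
{
	try_charge(1, 32);	/* batch charge needs a retry: no surplus cached */
	try_charge(1, 32);	/* batch charge succeeds: surplus goes to the stock */
	return 0;
}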
2509 2501
2510 /* 2502 /*
2511 * Sometimes we have to undo a charge we got by try_charge(). 2503 * Sometimes we have to undo a charge we got by try_charge().
2512 * This function is for that: it does the uncharge and puts the css refcnt 2504 * This function is for that: it does the uncharge and puts the css refcnt
2513 * gotten by try_charge(). 2505 * gotten by try_charge().
2514 */ 2506 */
2515 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2507 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2516 unsigned int nr_pages) 2508 unsigned int nr_pages)
2517 { 2509 {
2518 if (!mem_cgroup_is_root(mem)) { 2510 if (!mem_cgroup_is_root(mem)) {
2519 unsigned long bytes = nr_pages * PAGE_SIZE; 2511 unsigned long bytes = nr_pages * PAGE_SIZE;
2520 2512
2521 res_counter_uncharge(&mem->res, bytes); 2513 res_counter_uncharge(&mem->res, bytes);
2522 if (do_swap_account) 2514 if (do_swap_account)
2523 res_counter_uncharge(&mem->memsw, bytes); 2515 res_counter_uncharge(&mem->memsw, bytes);
2524 } 2516 }
2525 } 2517 }
2526 2518
2527 /* 2519 /*
2528 * A helper function to get a mem_cgroup from an ID. Must be called under 2520 * A helper function to get a mem_cgroup from an ID. Must be called under
2529 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2521 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2530 * that is a concern. (dropping a refcnt from swap can be called against a removed 2522 * that is a concern. (dropping a refcnt from swap can be called against a removed
2531 * memcg.) 2523 * memcg.)
2532 */ 2524 */
2533 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2525 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2534 { 2526 {
2535 struct cgroup_subsys_state *css; 2527 struct cgroup_subsys_state *css;
2536 2528
2537 /* ID 0 is unused ID */ 2529 /* ID 0 is unused ID */
2538 if (!id) 2530 if (!id)
2539 return NULL; 2531 return NULL;
2540 css = css_lookup(&mem_cgroup_subsys, id); 2532 css = css_lookup(&mem_cgroup_subsys, id);
2541 if (!css) 2533 if (!css)
2542 return NULL; 2534 return NULL;
2543 return container_of(css, struct mem_cgroup, css); 2535 return container_of(css, struct mem_cgroup, css);
2544 } 2536 }
2545 2537
2546 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2538 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2547 { 2539 {
2548 struct mem_cgroup *mem = NULL; 2540 struct mem_cgroup *mem = NULL;
2549 struct page_cgroup *pc; 2541 struct page_cgroup *pc;
2550 unsigned short id; 2542 unsigned short id;
2551 swp_entry_t ent; 2543 swp_entry_t ent;
2552 2544
2553 VM_BUG_ON(!PageLocked(page)); 2545 VM_BUG_ON(!PageLocked(page));
2554 2546
2555 pc = lookup_page_cgroup(page); 2547 pc = lookup_page_cgroup(page);
2556 lock_page_cgroup(pc); 2548 lock_page_cgroup(pc);
2557 if (PageCgroupUsed(pc)) { 2549 if (PageCgroupUsed(pc)) {
2558 mem = pc->mem_cgroup; 2550 mem = pc->mem_cgroup;
2559 if (mem && !css_tryget(&mem->css)) 2551 if (mem && !css_tryget(&mem->css))
2560 mem = NULL; 2552 mem = NULL;
2561 } else if (PageSwapCache(page)) { 2553 } else if (PageSwapCache(page)) {
2562 ent.val = page_private(page); 2554 ent.val = page_private(page);
2563 id = lookup_swap_cgroup(ent); 2555 id = lookup_swap_cgroup(ent);
2564 rcu_read_lock(); 2556 rcu_read_lock();
2565 mem = mem_cgroup_lookup(id); 2557 mem = mem_cgroup_lookup(id);
2566 if (mem && !css_tryget(&mem->css)) 2558 if (mem && !css_tryget(&mem->css))
2567 mem = NULL; 2559 mem = NULL;
2568 rcu_read_unlock(); 2560 rcu_read_unlock();
2569 } 2561 }
2570 unlock_page_cgroup(pc); 2562 unlock_page_cgroup(pc);
2571 return mem; 2563 return mem;
2572 } 2564 }
2573 2565
2574 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2566 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2575 struct page *page, 2567 struct page *page,
2576 unsigned int nr_pages, 2568 unsigned int nr_pages,
2577 struct page_cgroup *pc, 2569 struct page_cgroup *pc,
2578 enum charge_type ctype) 2570 enum charge_type ctype)
2579 { 2571 {
2580 lock_page_cgroup(pc); 2572 lock_page_cgroup(pc);
2581 if (unlikely(PageCgroupUsed(pc))) { 2573 if (unlikely(PageCgroupUsed(pc))) {
2582 unlock_page_cgroup(pc); 2574 unlock_page_cgroup(pc);
2583 __mem_cgroup_cancel_charge(mem, nr_pages); 2575 __mem_cgroup_cancel_charge(mem, nr_pages);
2584 return; 2576 return;
2585 } 2577 }
2586 /* 2578 /*
2587 * we don't need page_cgroup_lock for tail pages, because they are not 2579 * we don't need page_cgroup_lock for tail pages, because they are not
2588 * accessed by any other context at this point. 2580 * accessed by any other context at this point.
2589 */ 2581 */
2590 pc->mem_cgroup = mem; 2582 pc->mem_cgroup = mem;
2591 /* 2583 /*
2592 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2584 * We access a page_cgroup asynchronously without lock_page_cgroup().
2593 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2585 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2594 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2586 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2595 * before USED bit, we need memory barrier here. 2587 * before USED bit, we need memory barrier here.
2596 * See mem_cgroup_add_lru_list(), etc. 2588 * See mem_cgroup_add_lru_list(), etc.
2597 */ 2589 */
2598 smp_wmb(); 2590 smp_wmb();
2599 switch (ctype) { 2591 switch (ctype) {
2600 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2592 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2601 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2593 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2602 SetPageCgroupCache(pc); 2594 SetPageCgroupCache(pc);
2603 SetPageCgroupUsed(pc); 2595 SetPageCgroupUsed(pc);
2604 break; 2596 break;
2605 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2597 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2606 ClearPageCgroupCache(pc); 2598 ClearPageCgroupCache(pc);
2607 SetPageCgroupUsed(pc); 2599 SetPageCgroupUsed(pc);
2608 break; 2600 break;
2609 default: 2601 default:
2610 break; 2602 break;
2611 } 2603 }
2612 2604
2613 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2605 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2614 unlock_page_cgroup(pc); 2606 unlock_page_cgroup(pc);
2615 /* 2607 /*
2616 * "charge_statistics" updated event counter. Then, check it. 2608 * "charge_statistics" updated event counter. Then, check it.
2617 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2609 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2618 * if they exceeds softlimit. 2610 * if they exceeds softlimit.
2619 */ 2611 */
2620 memcg_check_events(mem, page); 2612 memcg_check_events(mem, page);
2621 } 2613 }
2622 2614
2623 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2615 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2624 2616
2625 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2617 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2626 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2618 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2627 /* 2619 /*
2628 * Because tail pages are not marked as "used", set it. We're under 2620 * Because tail pages are not marked as "used", set it. We're under
2629 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2621 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2630 */ 2622 */
2631 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2623 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2632 { 2624 {
2633 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2625 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2634 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2626 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2635 unsigned long flags; 2627 unsigned long flags;
2636 2628
2637 if (mem_cgroup_disabled()) 2629 if (mem_cgroup_disabled())
2638 return; 2630 return;
2639 /* 2631 /*
2640 * We have no races with charge/uncharge but will have races with 2632 * We have no races with charge/uncharge but will have races with
2641 * page state accounting. 2633 * page state accounting.
2642 */ 2634 */
2643 move_lock_page_cgroup(head_pc, &flags); 2635 move_lock_page_cgroup(head_pc, &flags);
2644 2636
2645 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2637 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2646 smp_wmb(); /* see __commit_charge() */ 2638 smp_wmb(); /* see __commit_charge() */
2647 if (PageCgroupAcctLRU(head_pc)) { 2639 if (PageCgroupAcctLRU(head_pc)) {
2648 enum lru_list lru; 2640 enum lru_list lru;
2649 struct mem_cgroup_per_zone *mz; 2641 struct mem_cgroup_per_zone *mz;
2650 2642
2651 /* 2643 /*
2652 * LRU flags cannot be copied because we need to add the tail 2644 * LRU flags cannot be copied because we need to add the tail
2653 * page to LRU by a generic call and our hook will be called. 2645 * page to LRU by a generic call and our hook will be called.
2654 * We hold lru_lock, so reduce the counter directly. 2646 * We hold lru_lock, so reduce the counter directly.
2655 */ 2647 */
2656 lru = page_lru(head); 2648 lru = page_lru(head);
2657 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2649 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2658 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2650 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2659 } 2651 }
2660 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2652 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2661 move_unlock_page_cgroup(head_pc, &flags); 2653 move_unlock_page_cgroup(head_pc, &flags);
2662 } 2654 }
2663 #endif 2655 #endif
2664 2656
2665 /** 2657 /**
2666 * mem_cgroup_move_account - move account of the page 2658 * mem_cgroup_move_account - move account of the page
2667 * @page: the page 2659 * @page: the page
2668 * @nr_pages: number of regular pages (>1 for huge pages) 2660 * @nr_pages: number of regular pages (>1 for huge pages)
2669 * @pc: page_cgroup of the page. 2661 * @pc: page_cgroup of the page.
2670 * @from: mem_cgroup which the page is moved from. 2662 * @from: mem_cgroup which the page is moved from.
2671 * @to: mem_cgroup which the page is moved to. @from != @to. 2663 * @to: mem_cgroup which the page is moved to. @from != @to.
2672 * @uncharge: whether we should call uncharge and css_put against @from. 2664 * @uncharge: whether we should call uncharge and css_put against @from.
2673 * 2665 *
2674 * The caller must confirm following. 2666 * The caller must confirm following.
2675 * - page is not on LRU (isolate_page() is useful.) 2667 * - page is not on LRU (isolate_page() is useful.)
2676 * - compound_lock is held when nr_pages > 1 2668 * - compound_lock is held when nr_pages > 1
2677 * 2669 *
2678 * This function doesn't do "charge" nor css_get to the new cgroup. That should be 2670 * This function doesn't do "charge" nor css_get to the new cgroup. That should be
2679 * done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge is 2671 * done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge is
2680 * true, this function does "uncharge" from the old cgroup, but it doesn't if 2672 * true, this function does "uncharge" from the old cgroup, but it doesn't if
2681 * @uncharge is false, so the caller should do "uncharge". 2673 * @uncharge is false, so the caller should do "uncharge".
2682 */ 2674 */
2683 static int mem_cgroup_move_account(struct page *page, 2675 static int mem_cgroup_move_account(struct page *page,
2684 unsigned int nr_pages, 2676 unsigned int nr_pages,
2685 struct page_cgroup *pc, 2677 struct page_cgroup *pc,
2686 struct mem_cgroup *from, 2678 struct mem_cgroup *from,
2687 struct mem_cgroup *to, 2679 struct mem_cgroup *to,
2688 bool uncharge) 2680 bool uncharge)
2689 { 2681 {
2690 unsigned long flags; 2682 unsigned long flags;
2691 int ret; 2683 int ret;
2692 2684
2693 VM_BUG_ON(from == to); 2685 VM_BUG_ON(from == to);
2694 VM_BUG_ON(PageLRU(page)); 2686 VM_BUG_ON(PageLRU(page));
2695 /* 2687 /*
2696 * The page is isolated from LRU. So, collapse function 2688 * The page is isolated from LRU. So, collapse function
2697 * will not handle this page. But page splitting can happen. 2689 * will not handle this page. But page splitting can happen.
2698 * Do this check under compound_page_lock(). The caller should 2690 * Do this check under compound_page_lock(). The caller should
2699 * hold it. 2691 * hold it.
2700 */ 2692 */
2701 ret = -EBUSY; 2693 ret = -EBUSY;
2702 if (nr_pages > 1 && !PageTransHuge(page)) 2694 if (nr_pages > 1 && !PageTransHuge(page))
2703 goto out; 2695 goto out;
2704 2696
2705 lock_page_cgroup(pc); 2697 lock_page_cgroup(pc);
2706 2698
2707 ret = -EINVAL; 2699 ret = -EINVAL;
2708 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2700 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2709 goto unlock; 2701 goto unlock;
2710 2702
2711 move_lock_page_cgroup(pc, &flags); 2703 move_lock_page_cgroup(pc, &flags);
2712 2704
2713 if (PageCgroupFileMapped(pc)) { 2705 if (PageCgroupFileMapped(pc)) {
2714 /* Update mapped_file data for mem_cgroup */ 2706 /* Update mapped_file data for mem_cgroup */
2715 preempt_disable(); 2707 preempt_disable();
2716 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2708 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2717 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2709 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2718 preempt_enable(); 2710 preempt_enable();
2719 } 2711 }
2720 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2712 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2721 if (uncharge) 2713 if (uncharge)
2722 /* This is not "cancel", but cancel_charge does all we need. */ 2714 /* This is not "cancel", but cancel_charge does all we need. */
2723 __mem_cgroup_cancel_charge(from, nr_pages); 2715 __mem_cgroup_cancel_charge(from, nr_pages);
2724 2716
2725 /* caller should have done css_get */ 2717 /* caller should have done css_get */
2726 pc->mem_cgroup = to; 2718 pc->mem_cgroup = to;
2727 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2719 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2728 /* 2720 /*
2729 * We charge against "to" which may not have any tasks. Then, "to" 2721 * We charge against "to" which may not have any tasks. Then, "to"
2730 * can be under rmdir(). But in current implementation, caller of 2722 * can be under rmdir(). But in current implementation, caller of
2731 * this function is just force_empty() and move charge, so it's 2723 * this function is just force_empty() and move charge, so it's
2732 * guaranteed that "to" is never removed. So, we don't check rmdir 2724 * guaranteed that "to" is never removed. So, we don't check rmdir
2733 * status here. 2725 * status here.
2734 */ 2726 */
2735 move_unlock_page_cgroup(pc, &flags); 2727 move_unlock_page_cgroup(pc, &flags);
2736 ret = 0; 2728 ret = 0;
2737 unlock: 2729 unlock:
2738 unlock_page_cgroup(pc); 2730 unlock_page_cgroup(pc);
2739 /* 2731 /*
2740 * check events 2732 * check events
2741 */ 2733 */
2742 memcg_check_events(to, page); 2734 memcg_check_events(to, page);
2743 memcg_check_events(from, page); 2735 memcg_check_events(from, page);
2744 out: 2736 out:
2745 return ret; 2737 return ret;
2746 } 2738 }
2747 2739
2748 /* 2740 /*
2749 * move charges to its parent. 2741 * move charges to its parent.
2750 */ 2742 */
2751 2743
2752 static int mem_cgroup_move_parent(struct page *page, 2744 static int mem_cgroup_move_parent(struct page *page,
2753 struct page_cgroup *pc, 2745 struct page_cgroup *pc,
2754 struct mem_cgroup *child, 2746 struct mem_cgroup *child,
2755 gfp_t gfp_mask) 2747 gfp_t gfp_mask)
2756 { 2748 {
2757 struct cgroup *cg = child->css.cgroup; 2749 struct cgroup *cg = child->css.cgroup;
2758 struct cgroup *pcg = cg->parent; 2750 struct cgroup *pcg = cg->parent;
2759 struct mem_cgroup *parent; 2751 struct mem_cgroup *parent;
2760 unsigned int nr_pages; 2752 unsigned int nr_pages;
2761 unsigned long uninitialized_var(flags); 2753 unsigned long uninitialized_var(flags);
2762 int ret; 2754 int ret;
2763 2755
2764 /* Is ROOT ? */ 2756 /* Is ROOT ? */
2765 if (!pcg) 2757 if (!pcg)
2766 return -EINVAL; 2758 return -EINVAL;
2767 2759
2768 ret = -EBUSY; 2760 ret = -EBUSY;
2769 if (!get_page_unless_zero(page)) 2761 if (!get_page_unless_zero(page))
2770 goto out; 2762 goto out;
2771 if (isolate_lru_page(page)) 2763 if (isolate_lru_page(page))
2772 goto put; 2764 goto put;
2773 2765
2774 nr_pages = hpage_nr_pages(page); 2766 nr_pages = hpage_nr_pages(page);
2775 2767
2776 parent = mem_cgroup_from_cont(pcg); 2768 parent = mem_cgroup_from_cont(pcg);
2777 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2769 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2778 if (ret || !parent) 2770 if (ret || !parent)
2779 goto put_back; 2771 goto put_back;
2780 2772
2781 if (nr_pages > 1) 2773 if (nr_pages > 1)
2782 flags = compound_lock_irqsave(page); 2774 flags = compound_lock_irqsave(page);
2783 2775
2784 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2776 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2785 if (ret) 2777 if (ret)
2786 __mem_cgroup_cancel_charge(parent, nr_pages); 2778 __mem_cgroup_cancel_charge(parent, nr_pages);
2787 2779
2788 if (nr_pages > 1) 2780 if (nr_pages > 1)
2789 compound_unlock_irqrestore(page, flags); 2781 compound_unlock_irqrestore(page, flags);
2790 put_back: 2782 put_back:
2791 putback_lru_page(page); 2783 putback_lru_page(page);
2792 put: 2784 put:
2793 put_page(page); 2785 put_page(page);
2794 out: 2786 out:
2795 return ret; 2787 return ret;
2796 } 2788 }
2797 2789
2798 /* 2790 /*
2799 * Charge the memory controller for page usage. 2791 * Charge the memory controller for page usage.
2800 * Return 2792 * Return
2801 * 0 if the charge was successful 2793 * 0 if the charge was successful
2802 * < 0 if the cgroup is over its limit 2794 * < 0 if the cgroup is over its limit
2803 */ 2795 */
2804 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2796 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2805 gfp_t gfp_mask, enum charge_type ctype) 2797 gfp_t gfp_mask, enum charge_type ctype)
2806 { 2798 {
2807 struct mem_cgroup *mem = NULL; 2799 struct mem_cgroup *mem = NULL;
2808 unsigned int nr_pages = 1; 2800 unsigned int nr_pages = 1;
2809 struct page_cgroup *pc; 2801 struct page_cgroup *pc;
2810 bool oom = true; 2802 bool oom = true;
2811 int ret; 2803 int ret;
2812 2804
2813 if (PageTransHuge(page)) { 2805 if (PageTransHuge(page)) {
2814 nr_pages <<= compound_order(page); 2806 nr_pages <<= compound_order(page);
2815 VM_BUG_ON(!PageTransHuge(page)); 2807 VM_BUG_ON(!PageTransHuge(page));
2816 /* 2808 /*
2817 * Never OOM-kill a process for a huge page. The 2809 * Never OOM-kill a process for a huge page. The
2818 * fault handler will fall back to regular pages. 2810 * fault handler will fall back to regular pages.
2819 */ 2811 */
2820 oom = false; 2812 oom = false;
2821 } 2813 }
2822 2814
2823 pc = lookup_page_cgroup(page); 2815 pc = lookup_page_cgroup(page);
2824 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2816 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2825 2817
2826 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2818 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2827 if (ret || !mem) 2819 if (ret || !mem)
2828 return ret; 2820 return ret;
2829 2821
2830 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2822 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2831 return 0; 2823 return 0;
2832 } 2824 }
2833 2825
2834 int mem_cgroup_newpage_charge(struct page *page, 2826 int mem_cgroup_newpage_charge(struct page *page,
2835 struct mm_struct *mm, gfp_t gfp_mask) 2827 struct mm_struct *mm, gfp_t gfp_mask)
2836 { 2828 {
2837 if (mem_cgroup_disabled()) 2829 if (mem_cgroup_disabled())
2838 return 0; 2830 return 0;
2839 /* 2831 /*
2840 * If already mapped, we don't have to account. 2832 * If already mapped, we don't have to account.
2841 * If page cache, page->mapping has address_space. 2833 * If page cache, page->mapping has address_space.
2842 * But page->mapping may hold a stale anon_vma pointer; 2834 * But page->mapping may hold a stale anon_vma pointer;
2843 * detect it by the PageAnon() check. A newly-mapped anon page's 2835 * detect it by the PageAnon() check. A newly-mapped anon page's
2844 * page->mapping is NULL. 2836 * page->mapping is NULL.
2845 */ 2837 */
2846 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2838 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2847 return 0; 2839 return 0;
2848 if (unlikely(!mm)) 2840 if (unlikely(!mm))
2849 mm = &init_mm; 2841 mm = &init_mm;
2850 return mem_cgroup_charge_common(page, mm, gfp_mask, 2842 return mem_cgroup_charge_common(page, mm, gfp_mask,
2851 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2843 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2852 } 2844 }
2853 2845
2854 static void 2846 static void
2855 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2847 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2856 enum charge_type ctype); 2848 enum charge_type ctype);
2857 2849
2858 static void 2850 static void
2859 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2851 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2860 enum charge_type ctype) 2852 enum charge_type ctype)
2861 { 2853 {
2862 struct page_cgroup *pc = lookup_page_cgroup(page); 2854 struct page_cgroup *pc = lookup_page_cgroup(page);
2863 /* 2855 /*
2864 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page 2856 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page
2865 * is already on the LRU, which means it may be on some other 2857 * is already on the LRU, which means it may be on some other
2866 * page_cgroup's LRU. Take care of it. 2858 * page_cgroup's LRU. Take care of it.
2867 */ 2859 */
2868 mem_cgroup_lru_del_before_commit(page); 2860 mem_cgroup_lru_del_before_commit(page);
2869 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2861 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2870 mem_cgroup_lru_add_after_commit(page); 2862 mem_cgroup_lru_add_after_commit(page);
2871 return; 2863 return;
2872 } 2864 }
2873 2865
2874 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2866 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2875 gfp_t gfp_mask) 2867 gfp_t gfp_mask)
2876 { 2868 {
2877 struct mem_cgroup *mem = NULL; 2869 struct mem_cgroup *mem = NULL;
2878 int ret; 2870 int ret;
2879 2871
2880 if (mem_cgroup_disabled()) 2872 if (mem_cgroup_disabled())
2881 return 0; 2873 return 0;
2882 if (PageCompound(page)) 2874 if (PageCompound(page))
2883 return 0; 2875 return 0;
2884 /* 2876 /*
2885 * Corner case handling. This is usually called from 2877 * Corner case handling. This is usually called from
2886 * add_to_page_cache(). But some FS (shmem) precharges the page before 2878 * add_to_page_cache(). But some FS (shmem) precharges the page before
2887 * calling it and then calls add_to_page_cache() with GFP_NOWAIT. 2879 * calling it and then calls add_to_page_cache() with GFP_NOWAIT.
2888 * 2880 *
2889 * In the GFP_NOWAIT case the page may already be charged before 2881 * In the GFP_NOWAIT case the page may already be charged before
2890 * add_to_page_cache() (see shmem.c); check it here and avoid charging 2882 * add_to_page_cache() (see shmem.c); check it here and avoid charging
2891 * twice. (This works but pays a slightly larger cost.) 2883 * twice. (This works but pays a slightly larger cost.)
2892 * And when the page is SwapCache, it should take swap information 2884 * And when the page is SwapCache, it should take swap information
2893 * into account. This is under lock_page() now. 2885 * into account. This is under lock_page() now.
2894 */ 2886 */
2895 if (!(gfp_mask & __GFP_WAIT)) { 2887 if (!(gfp_mask & __GFP_WAIT)) {
2896 struct page_cgroup *pc; 2888 struct page_cgroup *pc;
2897 2889
2898 pc = lookup_page_cgroup(page); 2890 pc = lookup_page_cgroup(page);
2899 if (!pc) 2891 if (!pc)
2900 return 0; 2892 return 0;
2901 lock_page_cgroup(pc); 2893 lock_page_cgroup(pc);
2902 if (PageCgroupUsed(pc)) { 2894 if (PageCgroupUsed(pc)) {
2903 unlock_page_cgroup(pc); 2895 unlock_page_cgroup(pc);
2904 return 0; 2896 return 0;
2905 } 2897 }
2906 unlock_page_cgroup(pc); 2898 unlock_page_cgroup(pc);
2907 } 2899 }
2908 2900
2909 if (unlikely(!mm)) 2901 if (unlikely(!mm))
2910 mm = &init_mm; 2902 mm = &init_mm;
2911 2903
2912 if (page_is_file_cache(page)) { 2904 if (page_is_file_cache(page)) {
2913 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2905 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2914 if (ret || !mem) 2906 if (ret || !mem)
2915 return ret; 2907 return ret;
2916 2908
2917 /* 2909 /*
2918 * FUSE reuses pages without going through the final 2910 * FUSE reuses pages without going through the final
2919 * put that would remove them from the LRU list, make 2911 * put that would remove them from the LRU list, make
2920 * sure that they get relinked properly. 2912 * sure that they get relinked properly.
2921 */ 2913 */
2922 __mem_cgroup_commit_charge_lrucare(page, mem, 2914 __mem_cgroup_commit_charge_lrucare(page, mem,
2923 MEM_CGROUP_CHARGE_TYPE_CACHE); 2915 MEM_CGROUP_CHARGE_TYPE_CACHE);
2924 return ret; 2916 return ret;
2925 } 2917 }
2926 /* shmem */ 2918 /* shmem */
2927 if (PageSwapCache(page)) { 2919 if (PageSwapCache(page)) {
2928 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2920 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2929 if (!ret) 2921 if (!ret)
2930 __mem_cgroup_commit_charge_swapin(page, mem, 2922 __mem_cgroup_commit_charge_swapin(page, mem,
2931 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2923 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2932 } else 2924 } else
2933 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2925 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2934 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2926 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2935 2927
2936 return ret; 2928 return ret;
2937 } 2929 }
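/*
 * Usage sketch (illustrative; the helper below is assumed, not a real caller):
 * how the charge entry points above are used. Real callers are the anonymous
 * fault path (mem_cgroup_newpage_charge) and add_to_page_cache()
 * (mem_cgroup_cache_charge).
 */
static int example_charge_new_anon(struct page *page, struct mm_struct *mm)
{
	/* Account the page to mm's memcg: 0 on success, <0 if over the limit. */
	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
		return -ENOMEM;
	/* ... map the page; if that fails, call mem_cgroup_uncharge_page(page) ... */
	return 0;
}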
2938 2930
2939 /* 2931 /*
2940 * During swap-in (try_charge -> commit or cancel) the page is locked. 2932 * During swap-in (try_charge -> commit or cancel) the page is locked.
2941 * When try_charge() returns successfully, one refcnt to the memcg (without 2933 * When try_charge() returns successfully, one refcnt to the memcg (without
2942 * a struct page_cgroup) is acquired. This refcnt will be consumed by 2934 * a struct page_cgroup) is acquired. This refcnt will be consumed by
2943 * "commit()" or removed by "cancel()". 2935 * "commit()" or removed by "cancel()".
2944 */ 2936 */
2945 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2937 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2946 struct page *page, 2938 struct page *page,
2947 gfp_t mask, struct mem_cgroup **ptr) 2939 gfp_t mask, struct mem_cgroup **ptr)
2948 { 2940 {
2949 struct mem_cgroup *mem; 2941 struct mem_cgroup *mem;
2950 int ret; 2942 int ret;
2951 2943
2952 *ptr = NULL; 2944 *ptr = NULL;
2953 2945
2954 if (mem_cgroup_disabled()) 2946 if (mem_cgroup_disabled())
2955 return 0; 2947 return 0;
2956 2948
2957 if (!do_swap_account) 2949 if (!do_swap_account)
2958 goto charge_cur_mm; 2950 goto charge_cur_mm;
2959 /* 2951 /*
2960 * A racing thread's fault, or swapoff, may have already updated 2952 * A racing thread's fault, or swapoff, may have already updated
2961 * the pte, and even removed page from swap cache: in those cases 2953 * the pte, and even removed page from swap cache: in those cases
2962 * do_swap_page()'s pte_same() test will fail; but there's also a 2954 * do_swap_page()'s pte_same() test will fail; but there's also a
2963 * KSM case which does need to charge the page. 2955 * KSM case which does need to charge the page.
2964 */ 2956 */
2965 if (!PageSwapCache(page)) 2957 if (!PageSwapCache(page))
2966 goto charge_cur_mm; 2958 goto charge_cur_mm;
2967 mem = try_get_mem_cgroup_from_page(page); 2959 mem = try_get_mem_cgroup_from_page(page);
2968 if (!mem) 2960 if (!mem)
2969 goto charge_cur_mm; 2961 goto charge_cur_mm;
2970 *ptr = mem; 2962 *ptr = mem;
2971 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2963 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2972 css_put(&mem->css); 2964 css_put(&mem->css);
2973 return ret; 2965 return ret;
2974 charge_cur_mm: 2966 charge_cur_mm:
2975 if (unlikely(!mm)) 2967 if (unlikely(!mm))
2976 mm = &init_mm; 2968 mm = &init_mm;
2977 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2969 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2978 } 2970 }
2979 2971
2980 static void 2972 static void
2981 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2973 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2982 enum charge_type ctype) 2974 enum charge_type ctype)
2983 { 2975 {
2984 if (mem_cgroup_disabled()) 2976 if (mem_cgroup_disabled())
2985 return; 2977 return;
2986 if (!ptr) 2978 if (!ptr)
2987 return; 2979 return;
2988 cgroup_exclude_rmdir(&ptr->css); 2980 cgroup_exclude_rmdir(&ptr->css);
2989 2981
2990 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2982 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2991 /* 2983 /*
2992 * Now the swap is in memory. This means the page may be 2984 * Now the swap is in memory. This means the page may be
2993 * counted both as mem and swap (a double count). 2985 * counted both as mem and swap (a double count).
2994 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 2986 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
2995 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 2987 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
2996 * may call delete_from_swap_cache() before we reach here. 2988 * may call delete_from_swap_cache() before we reach here.
2997 */ 2989 */
2998 if (do_swap_account && PageSwapCache(page)) { 2990 if (do_swap_account && PageSwapCache(page)) {
2999 swp_entry_t ent = {.val = page_private(page)}; 2991 swp_entry_t ent = {.val = page_private(page)};
3000 unsigned short id; 2992 unsigned short id;
3001 struct mem_cgroup *memcg; 2993 struct mem_cgroup *memcg;
3002 2994
3003 id = swap_cgroup_record(ent, 0); 2995 id = swap_cgroup_record(ent, 0);
3004 rcu_read_lock(); 2996 rcu_read_lock();
3005 memcg = mem_cgroup_lookup(id); 2997 memcg = mem_cgroup_lookup(id);
3006 if (memcg) { 2998 if (memcg) {
3007 /* 2999 /*
3008 * The recorded memcg can be an obsolete one. So, avoid 3000 * The recorded memcg can be an obsolete one. So, avoid
3009 * calling css_tryget. 3001 * calling css_tryget.
3010 */ 3002 */
3011 if (!mem_cgroup_is_root(memcg)) 3003 if (!mem_cgroup_is_root(memcg))
3012 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3004 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3013 mem_cgroup_swap_statistics(memcg, false); 3005 mem_cgroup_swap_statistics(memcg, false);
3014 mem_cgroup_put(memcg); 3006 mem_cgroup_put(memcg);
3015 } 3007 }
3016 rcu_read_unlock(); 3008 rcu_read_unlock();
3017 } 3009 }
3018 /* 3010 /*
3019 * At swapin, we may charge a cgroup which has no tasks, so 3011 * At swapin, we may charge a cgroup which has no tasks, so
3020 * rmdir()->pre_destroy() can be called while we do this charge. 3012 * rmdir()->pre_destroy() can be called while we do this charge.
3021 * In that case, we need to call pre_destroy() again. Check it here. 3013 * In that case, we need to call pre_destroy() again. Check it here.
3022 */ 3014 */
3023 cgroup_release_and_wakeup_rmdir(&ptr->css); 3015 cgroup_release_and_wakeup_rmdir(&ptr->css);
3024 } 3016 }
3025 3017
3026 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 3018 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
3027 { 3019 {
3028 __mem_cgroup_commit_charge_swapin(page, ptr, 3020 __mem_cgroup_commit_charge_swapin(page, ptr,
3029 MEM_CGROUP_CHARGE_TYPE_MAPPED); 3021 MEM_CGROUP_CHARGE_TYPE_MAPPED);
3030 } 3022 }
3031 3023
3032 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 3024 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
3033 { 3025 {
3034 if (mem_cgroup_disabled()) 3026 if (mem_cgroup_disabled())
3035 return; 3027 return;
3036 if (!mem) 3028 if (!mem)
3037 return; 3029 return;
3038 __mem_cgroup_cancel_charge(mem, 1); 3030 __mem_cgroup_cancel_charge(mem, 1);
3039 } 3031 }
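/*
 * Usage sketch (illustrative; example_swapin_charge() and example_map() are
 * assumed names): the three-step swap-in protocol documented above. The
 * caller must hold lock_page() across the sequence.
 */
static int example_swapin_charge(struct mm_struct *mm, struct page *page)
{
	struct mem_cgroup *ptr;
	int err;

	/* Step 1: reserve the charge; on success a memcg refcnt is held via ptr. */
	err = mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr);
	if (err)
		return err;

	if (example_map(page) == 0) {
		/* Step 2a: consume the reserved charge once the page is mapped. */
		mem_cgroup_commit_charge_swapin(page, ptr);
		return 0;
	}
	/* Step 2b: give the reserved charge (and the refcnt) back. */
	mem_cgroup_cancel_charge_swapin(ptr);
	return -EAGAIN;
}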
3040 3032
3041 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 3033 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3042 unsigned int nr_pages, 3034 unsigned int nr_pages,
3043 const enum charge_type ctype) 3035 const enum charge_type ctype)
3044 { 3036 {
3045 struct memcg_batch_info *batch = NULL; 3037 struct memcg_batch_info *batch = NULL;
3046 bool uncharge_memsw = true; 3038 bool uncharge_memsw = true;
3047 3039
3048 /* If swapout, usage of swap doesn't decrease */ 3040 /* If swapout, usage of swap doesn't decrease */
3049 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 3041 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
3050 uncharge_memsw = false; 3042 uncharge_memsw = false;
3051 3043
3052 batch = &current->memcg_batch; 3044 batch = &current->memcg_batch;
3053 /* 3045 /*
3054 * Usually, we do css_get() when we remember a memcg pointer. 3046 * Usually, we do css_get() when we remember a memcg pointer.
3055 * But in this case, we keep res->usage until the end of a series of 3047 * But in this case, we keep res->usage until the end of a series of
3056 * uncharges, so it is ok to ignore the memcg's refcnt. 3048 * uncharges, so it is ok to ignore the memcg's refcnt.
3057 */ 3049 */
3058 if (!batch->memcg) 3050 if (!batch->memcg)
3059 batch->memcg = mem; 3051 batch->memcg = mem;
3060 /* 3052 /*
3061 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 3053 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3062 * In those cases, pages are freed continuously and can be expected to 3054 * In those cases, pages are freed continuously and can be expected to
3063 * be in the same cgroup, so we have a chance to coalesce uncharges. 3055 * be in the same cgroup, so we have a chance to coalesce uncharges.
3064 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE) 3056 * But we uncharge one by one if the task was killed by OOM (TIF_MEMDIE)
3065 * because we want to uncharge as soon as possible. 3057 * because we want to uncharge as soon as possible.
3066 */ 3058 */
3067 3059
3068 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 3060 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
3069 goto direct_uncharge; 3061 goto direct_uncharge;
3070 3062
3071 if (nr_pages > 1) 3063 if (nr_pages > 1)
3072 goto direct_uncharge; 3064 goto direct_uncharge;
3073 3065
3074 /* 3066 /*
3075 * In the typical case, batch->memcg == mem. This means we can 3067 * In the typical case, batch->memcg == mem. This means we can
3076 * merge a series of uncharges into a single res_counter uncharge. 3068 * merge a series of uncharges into a single res_counter uncharge.
3077 * If not, we uncharge the res_counter one by one. 3069 * If not, we uncharge the res_counter one by one.
3078 */ 3070 */
3079 if (batch->memcg != mem) 3071 if (batch->memcg != mem)
3080 goto direct_uncharge; 3072 goto direct_uncharge;
3081 /* remember freed charge and uncharge it later */ 3073 /* remember freed charge and uncharge it later */
3082 batch->nr_pages++; 3074 batch->nr_pages++;
3083 if (uncharge_memsw) 3075 if (uncharge_memsw)
3084 batch->memsw_nr_pages++; 3076 batch->memsw_nr_pages++;
3085 return; 3077 return;
3086 direct_uncharge: 3078 direct_uncharge:
3087 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 3079 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
3088 if (uncharge_memsw) 3080 if (uncharge_memsw)
3089 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 3081 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
3090 if (unlikely(batch->memcg != mem)) 3082 if (unlikely(batch->memcg != mem))
3091 memcg_oom_recover(mem); 3083 memcg_oom_recover(mem);
3092 return; 3084 return;
3093 } 3085 }
3094 3086
3095 /* 3087 /*
3096 * uncharge if !page_mapped(page) 3088 * uncharge if !page_mapped(page)
3097 */ 3089 */
3098 static struct mem_cgroup * 3090 static struct mem_cgroup *
3099 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 3091 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3100 { 3092 {
3101 struct mem_cgroup *mem = NULL; 3093 struct mem_cgroup *mem = NULL;
3102 unsigned int nr_pages = 1; 3094 unsigned int nr_pages = 1;
3103 struct page_cgroup *pc; 3095 struct page_cgroup *pc;
3104 3096
3105 if (mem_cgroup_disabled()) 3097 if (mem_cgroup_disabled())
3106 return NULL; 3098 return NULL;
3107 3099
3108 if (PageSwapCache(page)) 3100 if (PageSwapCache(page))
3109 return NULL; 3101 return NULL;
3110 3102
3111 if (PageTransHuge(page)) { 3103 if (PageTransHuge(page)) {
3112 nr_pages <<= compound_order(page); 3104 nr_pages <<= compound_order(page);
3113 VM_BUG_ON(!PageTransHuge(page)); 3105 VM_BUG_ON(!PageTransHuge(page));
3114 } 3106 }
3115 /* 3107 /*
3116 * Check if our page_cgroup is valid 3108 * Check if our page_cgroup is valid
3117 */ 3109 */
3118 pc = lookup_page_cgroup(page); 3110 pc = lookup_page_cgroup(page);
3119 if (unlikely(!pc || !PageCgroupUsed(pc))) 3111 if (unlikely(!pc || !PageCgroupUsed(pc)))
3120 return NULL; 3112 return NULL;
3121 3113
3122 lock_page_cgroup(pc); 3114 lock_page_cgroup(pc);
3123 3115
3124 mem = pc->mem_cgroup; 3116 mem = pc->mem_cgroup;
3125 3117
3126 if (!PageCgroupUsed(pc)) 3118 if (!PageCgroupUsed(pc))
3127 goto unlock_out; 3119 goto unlock_out;
3128 3120
3129 switch (ctype) { 3121 switch (ctype) {
3130 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3122 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
3131 case MEM_CGROUP_CHARGE_TYPE_DROP: 3123 case MEM_CGROUP_CHARGE_TYPE_DROP:
3132 /* See mem_cgroup_prepare_migration() */ 3124 /* See mem_cgroup_prepare_migration() */
3133 if (page_mapped(page) || PageCgroupMigration(pc)) 3125 if (page_mapped(page) || PageCgroupMigration(pc))
3134 goto unlock_out; 3126 goto unlock_out;
3135 break; 3127 break;
3136 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3128 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
3137 if (!PageAnon(page)) { /* Shared memory */ 3129 if (!PageAnon(page)) { /* Shared memory */
3138 if (page->mapping && !page_is_file_cache(page)) 3130 if (page->mapping && !page_is_file_cache(page))
3139 goto unlock_out; 3131 goto unlock_out;
3140 } else if (page_mapped(page)) /* Anon */ 3132 } else if (page_mapped(page)) /* Anon */
3141 goto unlock_out; 3133 goto unlock_out;
3142 break; 3134 break;
3143 default: 3135 default:
3144 break; 3136 break;
3145 } 3137 }
3146 3138
3147 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3139 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
3148 3140
3149 ClearPageCgroupUsed(pc); 3141 ClearPageCgroupUsed(pc);
3150 /* 3142 /*
3151 * pc->mem_cgroup is not cleared here. It will be accessed when it's 3143 * pc->mem_cgroup is not cleared here. It will be accessed when it's
3152 * freed from the LRU. This is safe because an uncharged page is expected 3144 * freed from the LRU. This is safe because an uncharged page is expected
3153 * not to be reused (it is freed soon). The exception is SwapCache, which 3145 * not to be reused (it is freed soon). The exception is SwapCache, which
3154 * is handled by special functions. 3146 * is handled by special functions.
3155 */ 3147 */
3156 3148
3157 unlock_page_cgroup(pc); 3149 unlock_page_cgroup(pc);
3158 /* 3150 /*
3159 * even after unlock, we have mem->res.usage here and this memcg 3151 * even after unlock, we have mem->res.usage here and this memcg
3160 * will never be freed. 3152 * will never be freed.
3161 */ 3153 */
3162 memcg_check_events(mem, page); 3154 memcg_check_events(mem, page);
3163 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3155 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3164 mem_cgroup_swap_statistics(mem, true); 3156 mem_cgroup_swap_statistics(mem, true);
3165 mem_cgroup_get(mem); 3157 mem_cgroup_get(mem);
3166 } 3158 }
3167 if (!mem_cgroup_is_root(mem)) 3159 if (!mem_cgroup_is_root(mem))
3168 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3160 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
3169 3161
3170 return mem; 3162 return mem;
3171 3163
3172 unlock_out: 3164 unlock_out:
3173 unlock_page_cgroup(pc); 3165 unlock_page_cgroup(pc);
3174 return NULL; 3166 return NULL;
3175 } 3167 }
3176 3168
3177 void mem_cgroup_uncharge_page(struct page *page) 3169 void mem_cgroup_uncharge_page(struct page *page)
3178 { 3170 {
3179 /* early check. */ 3171 /* early check. */
3180 if (page_mapped(page)) 3172 if (page_mapped(page))
3181 return; 3173 return;
3182 if (page->mapping && !PageAnon(page)) 3174 if (page->mapping && !PageAnon(page))
3183 return; 3175 return;
3184 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3176 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
3185 } 3177 }
3186 3178
3187 void mem_cgroup_uncharge_cache_page(struct page *page) 3179 void mem_cgroup_uncharge_cache_page(struct page *page)
3188 { 3180 {
3189 VM_BUG_ON(page_mapped(page)); 3181 VM_BUG_ON(page_mapped(page));
3190 VM_BUG_ON(page->mapping); 3182 VM_BUG_ON(page->mapping);
3191 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3183 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
3192 } 3184 }
3193 3185
3194 /* 3186 /*
3195 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. 3187 * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
3196 * In those cases, pages are freed continuously and we can expect them to 3188 * In those cases, pages are freed continuously and we can expect them to
3197 * be in the same memcg. Each of these callers itself limits the number of 3189 * be in the same memcg. Each of these callers itself limits the number of
3198 * pages freed at once and calls uncharge_start/end() properly. 3190 * pages freed at once and calls uncharge_start/end() properly.
3199 * This may be called multiple (2) times in one context. 3191 * This may be called multiple (2) times in one context.
3200 */ 3192 */
3201 3193
3202 void mem_cgroup_uncharge_start(void) 3194 void mem_cgroup_uncharge_start(void)
3203 { 3195 {
3204 current->memcg_batch.do_batch++; 3196 current->memcg_batch.do_batch++;
3205 /* We can nest. */ 3197 /* We can nest. */
3206 if (current->memcg_batch.do_batch == 1) { 3198 if (current->memcg_batch.do_batch == 1) {
3207 current->memcg_batch.memcg = NULL; 3199 current->memcg_batch.memcg = NULL;
3208 current->memcg_batch.nr_pages = 0; 3200 current->memcg_batch.nr_pages = 0;
3209 current->memcg_batch.memsw_nr_pages = 0; 3201 current->memcg_batch.memsw_nr_pages = 0;
3210 } 3202 }
3211 } 3203 }
3212 3204
3213 void mem_cgroup_uncharge_end(void) 3205 void mem_cgroup_uncharge_end(void)
3214 { 3206 {
3215 struct memcg_batch_info *batch = &current->memcg_batch; 3207 struct memcg_batch_info *batch = &current->memcg_batch;
3216 3208
3217 if (!batch->do_batch) 3209 if (!batch->do_batch)
3218 return; 3210 return;
3219 3211
3220 batch->do_batch--; 3212 batch->do_batch--;
3221 if (batch->do_batch) /* If stacked, do nothing. */ 3213 if (batch->do_batch) /* If stacked, do nothing. */
3222 return; 3214 return;
3223 3215
3224 if (!batch->memcg) 3216 if (!batch->memcg)
3225 return; 3217 return;
3226 /* 3218 /*
3227 * This "batch->memcg" is valid without any css_get/put etc... 3219 * This "batch->memcg" is valid without any css_get/put etc...
3228 * because we hide charges behind us. 3220 * because we hide charges behind us.
3229 */ 3221 */
3230 if (batch->nr_pages) 3222 if (batch->nr_pages)
3231 res_counter_uncharge(&batch->memcg->res, 3223 res_counter_uncharge(&batch->memcg->res,
3232 batch->nr_pages * PAGE_SIZE); 3224 batch->nr_pages * PAGE_SIZE);
3233 if (batch->memsw_nr_pages) 3225 if (batch->memsw_nr_pages)
3234 res_counter_uncharge(&batch->memcg->memsw, 3226 res_counter_uncharge(&batch->memcg->memsw,
3235 batch->memsw_nr_pages * PAGE_SIZE); 3227 batch->memsw_nr_pages * PAGE_SIZE);
3236 memcg_oom_recover(batch->memcg); 3228 memcg_oom_recover(batch->memcg);
3237 /* forget this pointer (for sanity check) */ 3229 /* forget this pointer (for sanity check) */
3238 batch->memcg = NULL; 3230 batch->memcg = NULL;
3239 } 3231 }
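/*
 * Usage sketch (illustrative; example_drop_pages() is an assumed helper):
 * bracketing a free loop with uncharge_start/end, as described above,
 * coalesces the per-page uncharges of one memcg into a single res_counter
 * update.
 */
static void example_drop_pages(struct page **pages, int nr)
{
	int i;

	mem_cgroup_uncharge_start();		/* begin batching in current->memcg_batch */
	for (i = 0; i < nr; i++)
		mem_cgroup_uncharge_page(pages[i]);
	mem_cgroup_uncharge_end();		/* flush the coalesced charge, if any */
}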
3240 3232
3241 #ifdef CONFIG_SWAP 3233 #ifdef CONFIG_SWAP
3242 /* 3234 /*
3243 * Called after __delete_from_swap_cache() to drop the "page" account. 3235 * Called after __delete_from_swap_cache() to drop the "page" account.
3244 * The memcg information is recorded in the swap_cgroup of "ent". 3236 * The memcg information is recorded in the swap_cgroup of "ent".
3245 */ 3237 */
3246 void 3238 void
3247 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 3239 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3248 { 3240 {
3249 struct mem_cgroup *memcg; 3241 struct mem_cgroup *memcg;
3250 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 3242 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
3251 3243
3252 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3244 if (!swapout) /* this was a swap cache but the swap is unused ! */
3253 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3245 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3254 3246
3255 memcg = __mem_cgroup_uncharge_common(page, ctype); 3247 memcg = __mem_cgroup_uncharge_common(page, ctype);
3256 3248
3257 /* 3249 /*
3258 * record memcg information, if swapout && memcg != NULL, 3250 * record memcg information, if swapout && memcg != NULL,
3259 * mem_cgroup_get() was called in uncharge(). 3251 * mem_cgroup_get() was called in uncharge().
3260 */ 3252 */
3261 if (do_swap_account && swapout && memcg) 3253 if (do_swap_account && swapout && memcg)
3262 swap_cgroup_record(ent, css_id(&memcg->css)); 3254 swap_cgroup_record(ent, css_id(&memcg->css));
3263 } 3255 }
3264 #endif 3256 #endif
3265 3257
3266 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3258 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
3267 /* 3259 /*
3268 * called from swap_entry_free(). remove record in swap_cgroup and 3260 * called from swap_entry_free(). remove record in swap_cgroup and
3269 * uncharge "memsw" account. 3261 * uncharge "memsw" account.
3270 */ 3262 */
3271 void mem_cgroup_uncharge_swap(swp_entry_t ent) 3263 void mem_cgroup_uncharge_swap(swp_entry_t ent)
3272 { 3264 {
3273 struct mem_cgroup *memcg; 3265 struct mem_cgroup *memcg;
3274 unsigned short id; 3266 unsigned short id;
3275 3267
3276 if (!do_swap_account) 3268 if (!do_swap_account)
3277 return; 3269 return;
3278 3270
3279 id = swap_cgroup_record(ent, 0); 3271 id = swap_cgroup_record(ent, 0);
3280 rcu_read_lock(); 3272 rcu_read_lock();
3281 memcg = mem_cgroup_lookup(id); 3273 memcg = mem_cgroup_lookup(id);
3282 if (memcg) { 3274 if (memcg) {
3283 /* 3275 /*
3284 * We uncharge this because swap is freed. 3276 * We uncharge this because swap is freed.
3285 * This memcg can be an obsolete one. We avoid calling css_tryget. 3277 * This memcg can be an obsolete one. We avoid calling css_tryget.
3286 */ 3278 */
3287 if (!mem_cgroup_is_root(memcg)) 3279 if (!mem_cgroup_is_root(memcg))
3288 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 3280 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
3289 mem_cgroup_swap_statistics(memcg, false); 3281 mem_cgroup_swap_statistics(memcg, false);
3290 mem_cgroup_put(memcg); 3282 mem_cgroup_put(memcg);
3291 } 3283 }
3292 rcu_read_unlock(); 3284 rcu_read_unlock();
3293 } 3285 }
3294 3286
3295 /** 3287 /**
3296 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 3288 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3297 * @entry: swap entry to be moved 3289 * @entry: swap entry to be moved
3298 * @from: mem_cgroup which the entry is moved from 3290 * @from: mem_cgroup which the entry is moved from
3299 * @to: mem_cgroup which the entry is moved to 3291 * @to: mem_cgroup which the entry is moved to
3300 * @need_fixup: whether we should fixup res_counters and refcounts. 3292 * @need_fixup: whether we should fixup res_counters and refcounts.
3301 * 3293 *
3302 * It succeeds only when the swap_cgroup's record for this entry is the same 3294 * It succeeds only when the swap_cgroup's record for this entry is the same
3303 * as the mem_cgroup's id of @from. 3295 * as the mem_cgroup's id of @from.
3304 * 3296 *
3305 * Returns 0 on success, -EINVAL on failure. 3297 * Returns 0 on success, -EINVAL on failure.
3306 * 3298 *
3307 * The caller must have charged to @to, IOW, called res_counter_charge() about 3299 * The caller must have charged to @to, IOW, called res_counter_charge() about
3308 * both res and memsw, and called css_get(). 3300 * both res and memsw, and called css_get().
3309 */ 3301 */
3310 static int mem_cgroup_move_swap_account(swp_entry_t entry, 3302 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3311 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3303 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3312 { 3304 {
3313 unsigned short old_id, new_id; 3305 unsigned short old_id, new_id;
3314 3306
3315 old_id = css_id(&from->css); 3307 old_id = css_id(&from->css);
3316 new_id = css_id(&to->css); 3308 new_id = css_id(&to->css);
3317 3309
3318 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 3310 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3319 mem_cgroup_swap_statistics(from, false); 3311 mem_cgroup_swap_statistics(from, false);
3320 mem_cgroup_swap_statistics(to, true); 3312 mem_cgroup_swap_statistics(to, true);
3321 /* 3313 /*
3322 * This function is only called from task migration context now. 3314 * This function is only called from task migration context now.
3323 * It postpones res_counter and refcount handling till the end 3315 * It postpones res_counter and refcount handling till the end
3324 * of task migration(mem_cgroup_clear_mc()) for performance 3316 * of task migration(mem_cgroup_clear_mc()) for performance
3325 * improvement. But we cannot postpone mem_cgroup_get(to) 3317 * improvement. But we cannot postpone mem_cgroup_get(to)
3326 * because if the process that has been moved to @to does 3318 * because if the process that has been moved to @to does
3327 * swap-in, the refcount of @to might be decreased to 0. 3319 * swap-in, the refcount of @to might be decreased to 0.
3328 */ 3320 */
3329 mem_cgroup_get(to); 3321 mem_cgroup_get(to);
3330 if (need_fixup) { 3322 if (need_fixup) {
3331 if (!mem_cgroup_is_root(from)) 3323 if (!mem_cgroup_is_root(from))
3332 res_counter_uncharge(&from->memsw, PAGE_SIZE); 3324 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3333 mem_cgroup_put(from); 3325 mem_cgroup_put(from);
3334 /* 3326 /*
3335 * we charged both to->res and to->memsw, so we should 3327 * we charged both to->res and to->memsw, so we should
3336 * uncharge to->res. 3328 * uncharge to->res.
3337 */ 3329 */
3338 if (!mem_cgroup_is_root(to)) 3330 if (!mem_cgroup_is_root(to))
3339 res_counter_uncharge(&to->res, PAGE_SIZE); 3331 res_counter_uncharge(&to->res, PAGE_SIZE);
3340 } 3332 }
3341 return 0; 3333 return 0;
3342 } 3334 }
3343 return -EINVAL; 3335 return -EINVAL;
3344 } 3336 }
3345 #else 3337 #else
3346 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3338 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3347 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3339 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
3348 { 3340 {
3349 return -EINVAL; 3341 return -EINVAL;
3350 } 3342 }
3351 #endif 3343 #endif
3352 3344
3353 /* 3345 /*
3354 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the 3346 * Before starting migration, account PAGE_SIZE to the mem_cgroup that the
3355 * old page belongs to. 3347 * old page belongs to.
3356 */ 3348 */
3357 int mem_cgroup_prepare_migration(struct page *page, 3349 int mem_cgroup_prepare_migration(struct page *page,
3358 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3350 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3359 { 3351 {
3360 struct mem_cgroup *mem = NULL; 3352 struct mem_cgroup *mem = NULL;
3361 struct page_cgroup *pc; 3353 struct page_cgroup *pc;
3362 enum charge_type ctype; 3354 enum charge_type ctype;
3363 int ret = 0; 3355 int ret = 0;
3364 3356
3365 *ptr = NULL; 3357 *ptr = NULL;
3366 3358
3367 VM_BUG_ON(PageTransHuge(page)); 3359 VM_BUG_ON(PageTransHuge(page));
3368 if (mem_cgroup_disabled()) 3360 if (mem_cgroup_disabled())
3369 return 0; 3361 return 0;
3370 3362
3371 pc = lookup_page_cgroup(page); 3363 pc = lookup_page_cgroup(page);
3372 lock_page_cgroup(pc); 3364 lock_page_cgroup(pc);
3373 if (PageCgroupUsed(pc)) { 3365 if (PageCgroupUsed(pc)) {
3374 mem = pc->mem_cgroup; 3366 mem = pc->mem_cgroup;
3375 css_get(&mem->css); 3367 css_get(&mem->css);
3376 /* 3368 /*
3377 * When migrating an anonymous page, its mapcount goes down 3369 * When migrating an anonymous page, its mapcount goes down
3378 * to 0 and uncharge() will be called. But, even if it's fully 3370 * to 0 and uncharge() will be called. But, even if it's fully
3379 * unmapped, migration may fail and this page has to be 3371 * unmapped, migration may fail and this page has to be
3380 * charged again. We set MIGRATION flag here and delay uncharge 3372 * charged again. We set MIGRATION flag here and delay uncharge
3381 * until end_migration() is called 3373 * until end_migration() is called
3382 * 3374 *
3383 * Corner Case Thinking 3375 * Corner Case Thinking
3384 * A) 3376 * A)
3385 * When the old page was mapped as Anon and it is unmapped and freed 3377 * When the old page was mapped as Anon and it is unmapped and freed
3386 * while migration was ongoing. 3378 * while migration was ongoing.
3387 * If unmap finds the old page, uncharge() of it will be delayed 3379 * If unmap finds the old page, uncharge() of it will be delayed
3388 * until end_migration(). If unmap finds a new page, it's 3380 * until end_migration(). If unmap finds a new page, it's
3389 * uncharged when the mapcount goes from 1 to 0. If the unmap code 3381 * uncharged when the mapcount goes from 1 to 0. If the unmap code
3390 * finds a swap_migration_entry, the new page will not be mapped 3382 * finds a swap_migration_entry, the new page will not be mapped
3391 * and end_migration() will find it (mapcount == 0). 3383 * and end_migration() will find it (mapcount == 0).
3392 * 3384 *
3393 * B) 3385 * B)
3394 * When the old page was mapped but migration fails, the kernel 3386 * When the old page was mapped but migration fails, the kernel
3395 * remaps it. A charge for it is kept by the MIGRATION flag even 3387 * remaps it. A charge for it is kept by the MIGRATION flag even
3396 * if the mapcount goes down to 0. We can remap it successfully 3388 * if the mapcount goes down to 0. We can remap it successfully
3397 * without charging it again. 3389 * without charging it again.
3398 * 3390 *
3399 * C) 3391 * C)
3400 * The "old" page is under lock_page() until the end of 3392 * The "old" page is under lock_page() until the end of
3401 * migration, so, the old page itself will not be swapped-out. 3393 * migration, so, the old page itself will not be swapped-out.
3402 * If the new page is swapped out before end_migration(), our 3394 * If the new page is swapped out before end_migration(), our
3403 * hook into the usual swap-out path will catch the event. 3395 * hook into the usual swap-out path will catch the event.
3404 */ 3396 */
3405 if (PageAnon(page)) 3397 if (PageAnon(page))
3406 SetPageCgroupMigration(pc); 3398 SetPageCgroupMigration(pc);
3407 } 3399 }
3408 unlock_page_cgroup(pc); 3400 unlock_page_cgroup(pc);
3409 /* 3401 /*
3410 * If the page is not charged at this point, 3402 * If the page is not charged at this point,
3411 * we return here. 3403 * we return here.
3412 */ 3404 */
3413 if (!mem) 3405 if (!mem)
3414 return 0; 3406 return 0;
3415 3407
3416 *ptr = mem; 3408 *ptr = mem;
3417 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3409 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3418 css_put(&mem->css);/* drop extra refcnt */ 3410 css_put(&mem->css);/* drop extra refcnt */
3419 if (ret || *ptr == NULL) { 3411 if (ret || *ptr == NULL) {
3420 if (PageAnon(page)) { 3412 if (PageAnon(page)) {
3421 lock_page_cgroup(pc); 3413 lock_page_cgroup(pc);
3422 ClearPageCgroupMigration(pc); 3414 ClearPageCgroupMigration(pc);
3423 unlock_page_cgroup(pc); 3415 unlock_page_cgroup(pc);
3424 /* 3416 /*
3425 * The old page may be fully unmapped while we kept it. 3417 * The old page may be fully unmapped while we kept it.
3426 */ 3418 */
3427 mem_cgroup_uncharge_page(page); 3419 mem_cgroup_uncharge_page(page);
3428 } 3420 }
3429 return -ENOMEM; 3421 return -ENOMEM;
3430 } 3422 }
3431 /* 3423 /*
3432 * We charge the new page before it is used/mapped. So, even if unlock_page() 3424 * We charge the new page before it is used/mapped. So, even if unlock_page()
3433 * is called before end_migration, we can catch all events on this new 3425 * is called before end_migration, we can catch all events on this new
3434 * page. In case the new page is migrated but not remapped, the new page's 3426 * page. In case the new page is migrated but not remapped, the new page's
3435 * mapcount will finally be 0 and we call uncharge in end_migration(). 3427 * mapcount will finally be 0 and we call uncharge in end_migration().
3436 */ 3428 */
3437 pc = lookup_page_cgroup(newpage); 3429 pc = lookup_page_cgroup(newpage);
3438 if (PageAnon(page)) 3430 if (PageAnon(page))
3439 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3431 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3440 else if (page_is_file_cache(page)) 3432 else if (page_is_file_cache(page))
3441 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3433 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3442 else 3434 else
3443 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3435 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3444 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3436 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3445 return ret; 3437 return ret;
3446 } 3438 }
3447 3439
3448 /* remove redundant charge if migration failed */ 3440 /* remove redundant charge if migration failed */
3449 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3441 void mem_cgroup_end_migration(struct mem_cgroup *mem,
3450 struct page *oldpage, struct page *newpage, bool migration_ok) 3442 struct page *oldpage, struct page *newpage, bool migration_ok)
3451 { 3443 {
3452 struct page *used, *unused; 3444 struct page *used, *unused;
3453 struct page_cgroup *pc; 3445 struct page_cgroup *pc;
3454 3446
3455 if (!mem) 3447 if (!mem)
3456 return; 3448 return;
3457 /* blocks rmdir() */ 3449 /* blocks rmdir() */
3458 cgroup_exclude_rmdir(&mem->css); 3450 cgroup_exclude_rmdir(&mem->css);
3459 if (!migration_ok) { 3451 if (!migration_ok) {
3460 used = oldpage; 3452 used = oldpage;
3461 unused = newpage; 3453 unused = newpage;
3462 } else { 3454 } else {
3463 used = newpage; 3455 used = newpage;
3464 unused = oldpage; 3456 unused = oldpage;
3465 } 3457 }
3466 /* 3458 /*
3467 * We disallowed uncharging pages under migration because the mapcount 3459 * We disallowed uncharging pages under migration because the mapcount
3468 * of the page temporarily goes down to zero. 3460 * of the page temporarily goes down to zero.
3469 * Clear the flag and check whether the page should be charged. 3461 * Clear the flag and check whether the page should be charged.
3470 */ 3462 */
3471 pc = lookup_page_cgroup(oldpage); 3463 pc = lookup_page_cgroup(oldpage);
3472 lock_page_cgroup(pc); 3464 lock_page_cgroup(pc);
3473 ClearPageCgroupMigration(pc); 3465 ClearPageCgroupMigration(pc);
3474 unlock_page_cgroup(pc); 3466 unlock_page_cgroup(pc);
3475 3467
3476 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3468 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3477 3469
3478 /* 3470 /*
3479 * If the page is file cache, the radix-tree replacement is atomic 3471 * If the page is file cache, the radix-tree replacement is atomic
3480 * and we can skip this check. When it was an Anon page, its mapcount 3472 * and we can skip this check. When it was an Anon page, its mapcount
3481 * went down to 0. But because we added the MIGRATION flag, it is not 3473 * went down to 0. But because we added the MIGRATION flag, it is not
3482 * uncharged yet. There are several cases, but the page->mapcount check 3474 * uncharged yet. There are several cases, but the page->mapcount check
3483 * and the USED bit check in mem_cgroup_uncharge_page() are enough. 3475 * and the USED bit check in mem_cgroup_uncharge_page() are enough.
3484 * (See prepare_charge() also.) 3476 * (See prepare_charge() also.)
3485 */ 3477 */
3486 if (PageAnon(used)) 3478 if (PageAnon(used))
3487 mem_cgroup_uncharge_page(used); 3479 mem_cgroup_uncharge_page(used);
3488 /* 3480 /*
3489 * At migration, we may charge account against cgroup which has no 3481 * At migration, we may charge account against cgroup which has no
3490 * tasks. 3482 * tasks.
3491 * So, rmdir()->pre_destroy() can be called while we do this charge. 3483 * So, rmdir()->pre_destroy() can be called while we do this charge.
3492 * In that case, we need to call pre_destroy() again. check it here. 3484 * In that case, we need to call pre_destroy() again. check it here.
3493 */ 3485 */
3494 cgroup_release_and_wakeup_rmdir(&mem->css); 3486 cgroup_release_and_wakeup_rmdir(&mem->css);
3495 } 3487 }
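/*
 * Usage sketch (illustrative; example_migrate() stands in for the real
 * migration work): how the prepare/end hooks above pair up around one page
 * migration attempt.
 */
static int example_migrate_one_page(struct page *page, struct page *newpage)
{
	struct mem_cgroup *mem = NULL;
	int rc;

	/* Charge the new page and mark the old one with the MIGRATION flag. */
	rc = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
	if (rc)
		return rc;			/* -ENOMEM: give up on this page */

	rc = example_migrate(page, newpage);

	/* Uncharge whichever of the two pages ended up unused. */
	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
	return rc;
}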
3496 3488
3497 /* 3489 /*
3498 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3490 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3499 * Calling hierarchical_reclaim is not enough because we should update 3491 * Calling hierarchical_reclaim is not enough because we should update
3500 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3492 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3501 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 3493 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3502 * not from the memcg which this page would be charged to. 3494 * not from the memcg which this page would be charged to.
3503 * try_charge_swapin does all of this work properly. 3495 * try_charge_swapin does all of this work properly.
3504 */ 3496 */
3505 int mem_cgroup_shmem_charge_fallback(struct page *page, 3497 int mem_cgroup_shmem_charge_fallback(struct page *page,
3506 struct mm_struct *mm, 3498 struct mm_struct *mm,
3507 gfp_t gfp_mask) 3499 gfp_t gfp_mask)
3508 { 3500 {
3509 struct mem_cgroup *mem; 3501 struct mem_cgroup *mem;
3510 int ret; 3502 int ret;
3511 3503
3512 if (mem_cgroup_disabled()) 3504 if (mem_cgroup_disabled())
3513 return 0; 3505 return 0;
3514 3506
3515 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3507 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3516 if (!ret) 3508 if (!ret)
3517 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3509 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3518 3510
3519 return ret; 3511 return ret;
3520 } 3512 }
3521 3513
3522 #ifdef CONFIG_DEBUG_VM 3514 #ifdef CONFIG_DEBUG_VM
3523 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3515 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3524 { 3516 {
3525 struct page_cgroup *pc; 3517 struct page_cgroup *pc;
3526 3518
3527 pc = lookup_page_cgroup(page); 3519 pc = lookup_page_cgroup(page);
3528 if (likely(pc) && PageCgroupUsed(pc)) 3520 if (likely(pc) && PageCgroupUsed(pc))
3529 return pc; 3521 return pc;
3530 return NULL; 3522 return NULL;
3531 } 3523 }
3532 3524
3533 bool mem_cgroup_bad_page_check(struct page *page) 3525 bool mem_cgroup_bad_page_check(struct page *page)
3534 { 3526 {
3535 if (mem_cgroup_disabled()) 3527 if (mem_cgroup_disabled())
3536 return false; 3528 return false;
3537 3529
3538 return lookup_page_cgroup_used(page) != NULL; 3530 return lookup_page_cgroup_used(page) != NULL;
3539 } 3531 }
3540 3532
3541 void mem_cgroup_print_bad_page(struct page *page) 3533 void mem_cgroup_print_bad_page(struct page *page)
3542 { 3534 {
3543 struct page_cgroup *pc; 3535 struct page_cgroup *pc;
3544 3536
3545 pc = lookup_page_cgroup_used(page); 3537 pc = lookup_page_cgroup_used(page);
3546 if (pc) { 3538 if (pc) {
3547 int ret = -1; 3539 int ret = -1;
3548 char *path; 3540 char *path;
3549 3541
3550 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3542 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3551 pc, pc->flags, pc->mem_cgroup); 3543 pc, pc->flags, pc->mem_cgroup);
3552 3544
3553 path = kmalloc(PATH_MAX, GFP_KERNEL); 3545 path = kmalloc(PATH_MAX, GFP_KERNEL);
3554 if (path) { 3546 if (path) {
3555 rcu_read_lock(); 3547 rcu_read_lock();
3556 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3548 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3557 path, PATH_MAX); 3549 path, PATH_MAX);
3558 rcu_read_unlock(); 3550 rcu_read_unlock();
3559 } 3551 }
3560 3552
3561 printk(KERN_CONT "(%s)\n", 3553 printk(KERN_CONT "(%s)\n",
3562 (ret < 0) ? "cannot get the path" : path); 3554 (ret < 0) ? "cannot get the path" : path);
3563 kfree(path); 3555 kfree(path);
3564 } 3556 }
3565 } 3557 }
3566 #endif 3558 #endif
3567 3559
3568 static DEFINE_MUTEX(set_limit_mutex); 3560 static DEFINE_MUTEX(set_limit_mutex);
3569 3561
3570 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3562 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3571 unsigned long long val) 3563 unsigned long long val)
3572 { 3564 {
3573 int retry_count; 3565 int retry_count;
3574 u64 memswlimit, memlimit; 3566 u64 memswlimit, memlimit;
3575 int ret = 0; 3567 int ret = 0;
3576 int children = mem_cgroup_count_children(memcg); 3568 int children = mem_cgroup_count_children(memcg);
3577 u64 curusage, oldusage; 3569 u64 curusage, oldusage;
3578 int enlarge; 3570 int enlarge;
3579 3571
3580 /* 3572 /*
3581 * To keep hierarchical_reclaim simple, how long we should retry 3573 * To keep hierarchical_reclaim simple, how long we should retry
3582 * depends on the caller. We set our retry-count to be a function 3574 * depends on the caller. We set our retry-count to be a function
3583 * of the number of children which we should visit in this loop. 3575 * of the number of children which we should visit in this loop.
3584 */ 3576 */
3585 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3577 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3586 3578
3587 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3579 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3588 3580
3589 enlarge = 0; 3581 enlarge = 0;
3590 while (retry_count) { 3582 while (retry_count) {
3591 if (signal_pending(current)) { 3583 if (signal_pending(current)) {
3592 ret = -EINTR; 3584 ret = -EINTR;
3593 break; 3585 break;
3594 } 3586 }
3595 /* 3587 /*
3596 * Rather than hiding all this in some function, I do it in an 3588 * Rather than hiding all this in some function, I do it in an
3597 * open-coded manner so you can see what it really does. 3589 * open-coded manner so you can see what it really does.
3598 * We have to guarantee mem->res.limit < mem->memsw.limit. 3590 * We have to guarantee mem->res.limit < mem->memsw.limit.
3599 */ 3591 */
3600 mutex_lock(&set_limit_mutex); 3592 mutex_lock(&set_limit_mutex);
3601 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3593 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3602 if (memswlimit < val) { 3594 if (memswlimit < val) {
3603 ret = -EINVAL; 3595 ret = -EINVAL;
3604 mutex_unlock(&set_limit_mutex); 3596 mutex_unlock(&set_limit_mutex);
3605 break; 3597 break;
3606 } 3598 }
3607 3599
3608 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3600 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3609 if (memlimit < val) 3601 if (memlimit < val)
3610 enlarge = 1; 3602 enlarge = 1;
3611 3603
3612 ret = res_counter_set_limit(&memcg->res, val); 3604 ret = res_counter_set_limit(&memcg->res, val);
3613 if (!ret) { 3605 if (!ret) {
3614 if (memswlimit == val) 3606 if (memswlimit == val)
3615 memcg->memsw_is_minimum = true; 3607 memcg->memsw_is_minimum = true;
3616 else 3608 else
3617 memcg->memsw_is_minimum = false; 3609 memcg->memsw_is_minimum = false;
3618 } 3610 }
3619 mutex_unlock(&set_limit_mutex); 3611 mutex_unlock(&set_limit_mutex);
3620 3612
3621 if (!ret) 3613 if (!ret)
3622 break; 3614 break;
3623 3615
3624 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3616 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3625 MEM_CGROUP_RECLAIM_SHRINK, 3617 MEM_CGROUP_RECLAIM_SHRINK,
3626 NULL); 3618 NULL);
3627 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3619 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3628 /* Usage is reduced ? */ 3620 /* Usage is reduced ? */
3629 if (curusage >= oldusage) 3621 if (curusage >= oldusage)
3630 retry_count--; 3622 retry_count--;
3631 else 3623 else
3632 oldusage = curusage; 3624 oldusage = curusage;
3633 } 3625 }
3634 if (!ret && enlarge) 3626 if (!ret && enlarge)
3635 memcg_oom_recover(memcg); 3627 memcg_oom_recover(memcg);
3636 3628
3637 return ret; 3629 return ret;
3638 } 3630 }
3639 3631
3640 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3632 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3641 unsigned long long val) 3633 unsigned long long val)
3642 { 3634 {
3643 int retry_count; 3635 int retry_count;
3644 u64 memlimit, memswlimit, oldusage, curusage; 3636 u64 memlimit, memswlimit, oldusage, curusage;
3645 int children = mem_cgroup_count_children(memcg); 3637 int children = mem_cgroup_count_children(memcg);
3646 int ret = -EBUSY; 3638 int ret = -EBUSY;
3647 int enlarge = 0; 3639 int enlarge = 0;
3648 3640
3649 /* see mem_cgroup_resize_res_limit */ 3641 /* see mem_cgroup_resize_res_limit */
3650 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3642 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3651 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3643 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3652 while (retry_count) { 3644 while (retry_count) {
3653 if (signal_pending(current)) { 3645 if (signal_pending(current)) {
3654 ret = -EINTR; 3646 ret = -EINTR;
3655 break; 3647 break;
3656 } 3648 }
3657 /* 3649 /*
3658 * Rather than hiding all this in some function, I do it in an 3650 * Rather than hiding all this in some function, I do it in an
3659 * open-coded manner so you can see what it really does. 3651 * open-coded manner so you can see what it really does.
3660 * We have to guarantee mem->res.limit < mem->memsw.limit. 3652 * We have to guarantee mem->res.limit < mem->memsw.limit.
3661 */ 3653 */
3662 mutex_lock(&set_limit_mutex); 3654 mutex_lock(&set_limit_mutex);
3663 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3655 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3664 if (memlimit > val) { 3656 if (memlimit > val) {
3665 ret = -EINVAL; 3657 ret = -EINVAL;
3666 mutex_unlock(&set_limit_mutex); 3658 mutex_unlock(&set_limit_mutex);
3667 break; 3659 break;
3668 } 3660 }
3669 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3661 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3670 if (memswlimit < val) 3662 if (memswlimit < val)
3671 enlarge = 1; 3663 enlarge = 1;
3672 ret = res_counter_set_limit(&memcg->memsw, val); 3664 ret = res_counter_set_limit(&memcg->memsw, val);
3673 if (!ret) { 3665 if (!ret) {
3674 if (memlimit == val) 3666 if (memlimit == val)
3675 memcg->memsw_is_minimum = true; 3667 memcg->memsw_is_minimum = true;
3676 else 3668 else
3677 memcg->memsw_is_minimum = false; 3669 memcg->memsw_is_minimum = false;
3678 } 3670 }
3679 mutex_unlock(&set_limit_mutex); 3671 mutex_unlock(&set_limit_mutex);
3680 3672
3681 if (!ret) 3673 if (!ret)
3682 break; 3674 break;
3683 3675
3684 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3676 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3685 MEM_CGROUP_RECLAIM_NOSWAP | 3677 MEM_CGROUP_RECLAIM_NOSWAP |
3686 MEM_CGROUP_RECLAIM_SHRINK, 3678 MEM_CGROUP_RECLAIM_SHRINK,
3687 NULL); 3679 NULL);
3688 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3680 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3689 /* Usage is reduced ? */ 3681 /* Usage is reduced ? */
3690 if (curusage >= oldusage) 3682 if (curusage >= oldusage)
3691 retry_count--; 3683 retry_count--;
3692 else 3684 else
3693 oldusage = curusage; 3685 oldusage = curusage;
3694 } 3686 }
3695 if (!ret && enlarge) 3687 if (!ret && enlarge)
3696 memcg_oom_recover(memcg); 3688 memcg_oom_recover(memcg);
3697 return ret; 3689 return ret;
3698 } 3690 }
3699 3691
3700 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3692 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3701 gfp_t gfp_mask, 3693 gfp_t gfp_mask,
3702 unsigned long *total_scanned) 3694 unsigned long *total_scanned)
3703 { 3695 {
3704 unsigned long nr_reclaimed = 0; 3696 unsigned long nr_reclaimed = 0;
3705 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3697 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3706 unsigned long reclaimed; 3698 unsigned long reclaimed;
3707 int loop = 0; 3699 int loop = 0;
3708 struct mem_cgroup_tree_per_zone *mctz; 3700 struct mem_cgroup_tree_per_zone *mctz;
3709 unsigned long long excess; 3701 unsigned long long excess;
3710 unsigned long nr_scanned; 3702 unsigned long nr_scanned;
3711 3703
3712 if (order > 0) 3704 if (order > 0)
3713 return 0; 3705 return 0;
3714 3706
3715 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3707 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3716 /* 3708 /*
3717 * This loop can run for a while, especially if mem_cgroups continuously 3709 * This loop can run for a while, especially if mem_cgroups continuously
3718 * keep exceeding their soft limit and putting the system under 3710 * keep exceeding their soft limit and putting the system under
3719 * pressure. 3711 * pressure.
3720 */ 3712 */
3721 do { 3713 do {
3722 if (next_mz) 3714 if (next_mz)
3723 mz = next_mz; 3715 mz = next_mz;
3724 else 3716 else
3725 mz = mem_cgroup_largest_soft_limit_node(mctz); 3717 mz = mem_cgroup_largest_soft_limit_node(mctz);
3726 if (!mz) 3718 if (!mz)
3727 break; 3719 break;
3728 3720
3729 nr_scanned = 0; 3721 nr_scanned = 0;
3730 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3722 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3731 gfp_mask, 3723 gfp_mask,
3732 MEM_CGROUP_RECLAIM_SOFT, 3724 MEM_CGROUP_RECLAIM_SOFT,
3733 &nr_scanned); 3725 &nr_scanned);
3734 nr_reclaimed += reclaimed; 3726 nr_reclaimed += reclaimed;
3735 *total_scanned += nr_scanned; 3727 *total_scanned += nr_scanned;
3736 spin_lock(&mctz->lock); 3728 spin_lock(&mctz->lock);
3737 3729
3738 /* 3730 /*
3739 * If we failed to reclaim anything from this memory cgroup 3731 * If we failed to reclaim anything from this memory cgroup
3740 * it is time to move on to the next cgroup 3732 * it is time to move on to the next cgroup
3741 */ 3733 */
3742 next_mz = NULL; 3734 next_mz = NULL;
3743 if (!reclaimed) { 3735 if (!reclaimed) {
3744 do { 3736 do {
3745 /* 3737 /*
3746 * Loop until we find yet another one. 3738 * Loop until we find yet another one.
3747 * 3739 *
3748 * By the time we get the soft_limit lock 3740 * By the time we get the soft_limit lock
3749 * again, someone might have added the 3741 * again, someone might have added the
3750 * group back on the RB tree. Iterate to 3742 * group back on the RB tree. Iterate to
3751 * make sure we get a different mem. 3743 * make sure we get a different mem.
3752 * mem_cgroup_largest_soft_limit_node returns 3744 * mem_cgroup_largest_soft_limit_node returns
3753 * NULL if no other cgroup is present on 3745 * NULL if no other cgroup is present on
3754 * the tree 3746 * the tree
3755 */ 3747 */
3756 next_mz = 3748 next_mz =
3757 __mem_cgroup_largest_soft_limit_node(mctz); 3749 __mem_cgroup_largest_soft_limit_node(mctz);
3758 if (next_mz == mz) 3750 if (next_mz == mz)
3759 css_put(&next_mz->mem->css); 3751 css_put(&next_mz->mem->css);
3760 else /* next_mz == NULL or other memcg */ 3752 else /* next_mz == NULL or other memcg */
3761 break; 3753 break;
3762 } while (1); 3754 } while (1);
3763 } 3755 }
3764 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3756 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3765 excess = res_counter_soft_limit_excess(&mz->mem->res); 3757 excess = res_counter_soft_limit_excess(&mz->mem->res);
3766 /* 3758 /*
3767 * One school of thought says that we should not add 3759 * One school of thought says that we should not add
3768 * back the node to the tree if reclaim returns 0. 3760 * back the node to the tree if reclaim returns 0.
3769 * But our reclaim could return 0 simply because, due 3761 * But our reclaim could return 0 simply because, due
3770 * to priority, we are exposing a smaller subset of 3762 * to priority, we are exposing a smaller subset of
3771 * memory to reclaim from. Consider this as a longer 3763 * memory to reclaim from. Consider this as a longer
3772 * term TODO. 3764 * term TODO.
3773 */ 3765 */
3774 /* If excess == 0, no tree ops */ 3766 /* If excess == 0, no tree ops */
3775 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3767 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3776 spin_unlock(&mctz->lock); 3768 spin_unlock(&mctz->lock);
3777 css_put(&mz->mem->css); 3769 css_put(&mz->mem->css);
3778 loop++; 3770 loop++;
3779 /* 3771 /*
3780 * Could not reclaim anything and there are no more 3772 * Could not reclaim anything and there are no more
3781 * mem cgroups to try or we seem to be looping without 3773 * mem cgroups to try or we seem to be looping without
3782 * reclaiming anything. 3774 * reclaiming anything.
3783 */ 3775 */
3784 if (!nr_reclaimed && 3776 if (!nr_reclaimed &&
3785 (next_mz == NULL || 3777 (next_mz == NULL ||
3786 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3778 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3787 break; 3779 break;
3788 } while (!nr_reclaimed); 3780 } while (!nr_reclaimed);
3789 if (next_mz) 3781 if (next_mz)
3790 css_put(&next_mz->mem->css); 3782 css_put(&next_mz->mem->css);
3791 return nr_reclaimed; 3783 return nr_reclaimed;
3792 } 3784 }
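A minimal user-space sketch of the loop above: pick the group with the largest soft-limit excess, reclaim from it, and keep looping only while nothing was freed. The group array and the pick_largest()/reclaim() helpers are invented stand-ins for the RB-tree node and for mem_cgroup_largest_soft_limit_node()/mem_cgroup_hierarchical_reclaim().

/* Sketch only (not kernel code): soft-limit style reclaim loop
 * over a toy set of groups.
 */
#include <stdio.h>

struct group { const char *name; long excess; };

static struct group groups[] = {
	{ "A", 300 }, { "B", 120 }, { "C", 0 },
};

static struct group *pick_largest(void)
{
	struct group *best = NULL;
	for (unsigned i = 0; i < sizeof(groups)/sizeof(groups[0]); i++)
		if (groups[i].excess > 0 &&
		    (!best || groups[i].excess > best->excess))
			best = &groups[i];
	return best;
}

/* Pretend to reclaim up to 100 units from a group. */
static long reclaim(struct group *g)
{
	long freed = g->excess < 100 ? g->excess : 100;
	g->excess -= freed;
	return freed;
}

int main(void)
{
	long total = 0;
	int loop = 0;

	do {
		struct group *g = pick_largest();
		if (!g)
			break;		/* nothing is over its soft limit */
		total += reclaim(g);	/* group stays eligible while excess > 0 */
		loop++;
	} while (!total && loop < 4);	/* keep trying only while nothing was freed */

	printf("reclaimed %ld units in %d loops\n", total, loop);
	return 0;
}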
3793 3785
3794 /* 3786 /*
3795 * This routine traverses the page_cgroups in the given list and drops them all. 3787 * This routine traverses the page_cgroups in the given list and drops them all.
3796 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3788 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3797 */ 3789 */
3798 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3790 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3799 int node, int zid, enum lru_list lru) 3791 int node, int zid, enum lru_list lru)
3800 { 3792 {
3801 struct zone *zone; 3793 struct zone *zone;
3802 struct mem_cgroup_per_zone *mz; 3794 struct mem_cgroup_per_zone *mz;
3803 struct page_cgroup *pc, *busy; 3795 struct page_cgroup *pc, *busy;
3804 unsigned long flags, loop; 3796 unsigned long flags, loop;
3805 struct list_head *list; 3797 struct list_head *list;
3806 int ret = 0; 3798 int ret = 0;
3807 3799
3808 zone = &NODE_DATA(node)->node_zones[zid]; 3800 zone = &NODE_DATA(node)->node_zones[zid];
3809 mz = mem_cgroup_zoneinfo(mem, node, zid); 3801 mz = mem_cgroup_zoneinfo(mem, node, zid);
3810 list = &mz->lists[lru]; 3802 list = &mz->lists[lru];
3811 3803
3812 loop = MEM_CGROUP_ZSTAT(mz, lru); 3804 loop = MEM_CGROUP_ZSTAT(mz, lru);
3813 /* give some margin against EBUSY etc...*/ 3805 /* give some margin against EBUSY etc...*/
3814 loop += 256; 3806 loop += 256;
3815 busy = NULL; 3807 busy = NULL;
3816 while (loop--) { 3808 while (loop--) {
3817 struct page *page; 3809 struct page *page;
3818 3810
3819 ret = 0; 3811 ret = 0;
3820 spin_lock_irqsave(&zone->lru_lock, flags); 3812 spin_lock_irqsave(&zone->lru_lock, flags);
3821 if (list_empty(list)) { 3813 if (list_empty(list)) {
3822 spin_unlock_irqrestore(&zone->lru_lock, flags); 3814 spin_unlock_irqrestore(&zone->lru_lock, flags);
3823 break; 3815 break;
3824 } 3816 }
3825 pc = list_entry(list->prev, struct page_cgroup, lru); 3817 pc = list_entry(list->prev, struct page_cgroup, lru);
3826 if (busy == pc) { 3818 if (busy == pc) {
3827 list_move(&pc->lru, list); 3819 list_move(&pc->lru, list);
3828 busy = NULL; 3820 busy = NULL;
3829 spin_unlock_irqrestore(&zone->lru_lock, flags); 3821 spin_unlock_irqrestore(&zone->lru_lock, flags);
3830 continue; 3822 continue;
3831 } 3823 }
3832 spin_unlock_irqrestore(&zone->lru_lock, flags); 3824 spin_unlock_irqrestore(&zone->lru_lock, flags);
3833 3825
3834 page = lookup_cgroup_page(pc); 3826 page = lookup_cgroup_page(pc);
3835 3827
3836 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3828 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3837 if (ret == -ENOMEM) 3829 if (ret == -ENOMEM)
3838 break; 3830 break;
3839 3831
3840 if (ret == -EBUSY || ret == -EINVAL) { 3832 if (ret == -EBUSY || ret == -EINVAL) {
3841 /* found lock contention or "pc" is obsolete. */ 3833 /* found lock contention or "pc" is obsolete. */
3842 busy = pc; 3834 busy = pc;
3843 cond_resched(); 3835 cond_resched();
3844 } else 3836 } else
3845 busy = NULL; 3837 busy = NULL;
3846 } 3838 }
3847 3839
3848 if (!ret && !list_empty(list)) 3840 if (!ret && !list_empty(list))
3849 return -EBUSY; 3841 return -EBUSY;
3850 return ret; 3842 return ret;
3851 } 3843 }
3852 3844
3853 /* 3845 /*
3854 * make the mem_cgroup's charge 0 if there is no task. 3846 * make the mem_cgroup's charge 0 if there is no task.
3855 * This enables deleting this mem_cgroup. 3847 * This enables deleting this mem_cgroup.
3856 */ 3848 */
3857 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3849 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3858 { 3850 {
3859 int ret; 3851 int ret;
3860 int node, zid, shrink; 3852 int node, zid, shrink;
3861 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3853 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3862 struct cgroup *cgrp = mem->css.cgroup; 3854 struct cgroup *cgrp = mem->css.cgroup;
3863 3855
3864 css_get(&mem->css); 3856 css_get(&mem->css);
3865 3857
3866 shrink = 0; 3858 shrink = 0;
3867 /* should free all ? */ 3859 /* should free all ? */
3868 if (free_all) 3860 if (free_all)
3869 goto try_to_free; 3861 goto try_to_free;
3870 move_account: 3862 move_account:
3871 do { 3863 do {
3872 ret = -EBUSY; 3864 ret = -EBUSY;
3873 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3865 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3874 goto out; 3866 goto out;
3875 ret = -EINTR; 3867 ret = -EINTR;
3876 if (signal_pending(current)) 3868 if (signal_pending(current))
3877 goto out; 3869 goto out;
3878 /* This is for making all *used* pages be on the LRU. */ 3870 /* This is for making all *used* pages be on the LRU. */
3879 lru_add_drain_all(); 3871 lru_add_drain_all();
3880 drain_all_stock_sync(mem); 3872 drain_all_stock_sync(mem);
3881 ret = 0; 3873 ret = 0;
3882 mem_cgroup_start_move(mem); 3874 mem_cgroup_start_move(mem);
3883 for_each_node_state(node, N_HIGH_MEMORY) { 3875 for_each_node_state(node, N_HIGH_MEMORY) {
3884 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3876 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3885 enum lru_list l; 3877 enum lru_list l;
3886 for_each_lru(l) { 3878 for_each_lru(l) {
3887 ret = mem_cgroup_force_empty_list(mem, 3879 ret = mem_cgroup_force_empty_list(mem,
3888 node, zid, l); 3880 node, zid, l);
3889 if (ret) 3881 if (ret)
3890 break; 3882 break;
3891 } 3883 }
3892 } 3884 }
3893 if (ret) 3885 if (ret)
3894 break; 3886 break;
3895 } 3887 }
3896 mem_cgroup_end_move(mem); 3888 mem_cgroup_end_move(mem);
3897 memcg_oom_recover(mem); 3889 memcg_oom_recover(mem);
3898 /* it seems the parent cgroup doesn't have enough memory */ 3890 /* it seems the parent cgroup doesn't have enough memory */
3899 if (ret == -ENOMEM) 3891 if (ret == -ENOMEM)
3900 goto try_to_free; 3892 goto try_to_free;
3901 cond_resched(); 3893 cond_resched();
3902 /* "ret" should also be checked to ensure all lists are empty. */ 3894 /* "ret" should also be checked to ensure all lists are empty. */
3903 } while (mem->res.usage > 0 || ret); 3895 } while (mem->res.usage > 0 || ret);
3904 out: 3896 out:
3905 css_put(&mem->css); 3897 css_put(&mem->css);
3906 return ret; 3898 return ret;
3907 3899
3908 try_to_free: 3900 try_to_free:
3909 /* returns EBUSY if there is a task or if we come here twice. */ 3901 /* returns EBUSY if there is a task or if we come here twice. */
3910 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3902 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3911 ret = -EBUSY; 3903 ret = -EBUSY;
3912 goto out; 3904 goto out;
3913 } 3905 }
3914 /* we call try-to-free pages to make this cgroup empty */ 3906 /* we call try-to-free pages to make this cgroup empty */
3915 lru_add_drain_all(); 3907 lru_add_drain_all();
3916 /* try to free all pages in this cgroup */ 3908 /* try to free all pages in this cgroup */
3917 shrink = 1; 3909 shrink = 1;
3918 while (nr_retries && mem->res.usage > 0) { 3910 while (nr_retries && mem->res.usage > 0) {
3919 struct memcg_scanrecord rec; 3911 struct memcg_scanrecord rec;
3920 int progress; 3912 int progress;
3921 3913
3922 if (signal_pending(current)) { 3914 if (signal_pending(current)) {
3923 ret = -EINTR; 3915 ret = -EINTR;
3924 goto out; 3916 goto out;
3925 } 3917 }
3926 rec.context = SCAN_BY_SHRINK; 3918 rec.context = SCAN_BY_SHRINK;
3927 rec.mem = mem; 3919 rec.mem = mem;
3928 rec.root = mem; 3920 rec.root = mem;
3929 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3921 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3930 false, &rec); 3922 false, &rec);
3931 if (!progress) { 3923 if (!progress) {
3932 nr_retries--; 3924 nr_retries--;
3933 /* maybe some writeback is necessary */ 3925 /* maybe some writeback is necessary */
3934 congestion_wait(BLK_RW_ASYNC, HZ/10); 3926 congestion_wait(BLK_RW_ASYNC, HZ/10);
3935 } 3927 }
3936 3928
3937 } 3929 }
3938 lru_add_drain(); 3930 lru_add_drain();
3939 /* try move_account...there may be some *locked* pages. */ 3931 /* try move_account...there may be some *locked* pages. */
3940 goto move_account; 3932 goto move_account;
3941 } 3933 }
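The try_to_free path above boils down to a bounded retry loop: attempt reclaim, and whenever an attempt makes no progress, burn one retry and wait briefly for writeback. A compact stand-alone sketch of that pattern, with try_reclaim() and usleep() as stand-ins for try_to_free_mem_cgroup_pages() and congestion_wait():

/* Sketch only (not kernel code) of the bounded retry-with-backoff loop. */
#include <stdio.h>
#include <unistd.h>

static long usage = 5;		/* pretend pages are still charged */

static int try_reclaim(void)
{
	static int stall = 2;	/* first two attempts make no progress */
	if (stall) {
		stall--;
		return 0;
	}
	usage--;
	return 1;
}

int main(void)
{
	int nr_retries = 5;

	while (nr_retries && usage > 0) {
		if (!try_reclaim()) {
			nr_retries--;
			usleep(100 * 1000);	/* roughly congestion_wait(HZ/10) */
		}
	}
	printf("usage=%ld, retries left=%d\n", usage, nr_retries);
	return 0;
}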
3942 3934
3943 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3935 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3944 { 3936 {
3945 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3937 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3946 } 3938 }
3947 3939
3948 3940
3949 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3941 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3950 { 3942 {
3951 return mem_cgroup_from_cont(cont)->use_hierarchy; 3943 return mem_cgroup_from_cont(cont)->use_hierarchy;
3952 } 3944 }
3953 3945
3954 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3946 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3955 u64 val) 3947 u64 val)
3956 { 3948 {
3957 int retval = 0; 3949 int retval = 0;
3958 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3950 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3959 struct cgroup *parent = cont->parent; 3951 struct cgroup *parent = cont->parent;
3960 struct mem_cgroup *parent_mem = NULL; 3952 struct mem_cgroup *parent_mem = NULL;
3961 3953
3962 if (parent) 3954 if (parent)
3963 parent_mem = mem_cgroup_from_cont(parent); 3955 parent_mem = mem_cgroup_from_cont(parent);
3964 3956
3965 cgroup_lock(); 3957 cgroup_lock();
3966 /* 3958 /*
3967 * If parent's use_hierarchy is set, we can't make any modifications 3959 * If parent's use_hierarchy is set, we can't make any modifications
3968 * in the child subtrees. If it is unset, then the change can 3960 * in the child subtrees. If it is unset, then the change can
3969 * occur, provided the current cgroup has no children. 3961 * occur, provided the current cgroup has no children.
3970 * 3962 *
3971 * For the root cgroup, parent_mem is NULL, we allow value to be 3963 * For the root cgroup, parent_mem is NULL, we allow value to be
3972 * set if there are no children. 3964 * set if there are no children.
3973 */ 3965 */
3974 if ((!parent_mem || !parent_mem->use_hierarchy) && 3966 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3975 (val == 1 || val == 0)) { 3967 (val == 1 || val == 0)) {
3976 if (list_empty(&cont->children)) 3968 if (list_empty(&cont->children))
3977 mem->use_hierarchy = val; 3969 mem->use_hierarchy = val;
3978 else 3970 else
3979 retval = -EBUSY; 3971 retval = -EBUSY;
3980 } else 3972 } else
3981 retval = -EINVAL; 3973 retval = -EINVAL;
3982 cgroup_unlock(); 3974 cgroup_unlock();
3983 3975
3984 return retval; 3976 return retval;
3985 } 3977 }
3986 3978
3987 3979
3988 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3980 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3989 enum mem_cgroup_stat_index idx) 3981 enum mem_cgroup_stat_index idx)
3990 { 3982 {
3991 struct mem_cgroup *iter; 3983 struct mem_cgroup *iter;
3992 long val = 0; 3984 long val = 0;
3993 3985
3994 /* Per-cpu values can be negative, use a signed accumulator */ 3986 /* Per-cpu values can be negative, use a signed accumulator */
3995 for_each_mem_cgroup_tree(iter, mem) 3987 for_each_mem_cgroup_tree(iter, mem)
3996 val += mem_cgroup_read_stat(iter, idx); 3988 val += mem_cgroup_read_stat(iter, idx);
3997 3989
3998 if (val < 0) /* race ? */ 3990 if (val < 0) /* race ? */
3999 val = 0; 3991 val = 0;
4000 return val; 3992 return val;
4001 } 3993 }
4002 3994
4003 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3995 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
4004 { 3996 {
4005 u64 val; 3997 u64 val;
4006 3998
4007 if (!mem_cgroup_is_root(mem)) { 3999 if (!mem_cgroup_is_root(mem)) {
4008 if (!swap) 4000 if (!swap)
4009 return res_counter_read_u64(&mem->res, RES_USAGE); 4001 return res_counter_read_u64(&mem->res, RES_USAGE);
4010 else 4002 else
4011 return res_counter_read_u64(&mem->memsw, RES_USAGE); 4003 return res_counter_read_u64(&mem->memsw, RES_USAGE);
4012 } 4004 }
4013 4005
4014 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 4006 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
4015 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 4007 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
4016 4008
4017 if (swap) 4009 if (swap)
4018 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4010 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4019 4011
4020 return val << PAGE_SHIFT; 4012 return val << PAGE_SHIFT;
4021 } 4013 }
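For the root cgroup, mem_cgroup_usage() above assembles the usage from per-cpu page counters and shifts the page count into bytes. A trivial illustration of that conversion, assuming 4 KiB pages (the PAGE_SHIFT of 12 is an assumption of this sketch):

/* Sketch only: page counts to bytes via a shift. */
#include <stdio.h>
#include <stdint.h>

#define SKETCH_PAGE_SHIFT 12	/* assumed 4 KiB pages; the kernel uses PAGE_SHIFT */

int main(void)
{
	uint64_t cache_pages = 300, rss_pages = 700, swap_pages = 25;
	uint64_t mem   = (cache_pages + rss_pages) << SKETCH_PAGE_SHIFT;
	uint64_t memsw = (cache_pages + rss_pages + swap_pages) << SKETCH_PAGE_SHIFT;

	printf("usage=%llu bytes, usage+swap=%llu bytes\n",
	       (unsigned long long)mem, (unsigned long long)memsw);
	return 0;
}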
4022 4014
4023 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 4015 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4024 { 4016 {
4025 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4017 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4026 u64 val; 4018 u64 val;
4027 int type, name; 4019 int type, name;
4028 4020
4029 type = MEMFILE_TYPE(cft->private); 4021 type = MEMFILE_TYPE(cft->private);
4030 name = MEMFILE_ATTR(cft->private); 4022 name = MEMFILE_ATTR(cft->private);
4031 switch (type) { 4023 switch (type) {
4032 case _MEM: 4024 case _MEM:
4033 if (name == RES_USAGE) 4025 if (name == RES_USAGE)
4034 val = mem_cgroup_usage(mem, false); 4026 val = mem_cgroup_usage(mem, false);
4035 else 4027 else
4036 val = res_counter_read_u64(&mem->res, name); 4028 val = res_counter_read_u64(&mem->res, name);
4037 break; 4029 break;
4038 case _MEMSWAP: 4030 case _MEMSWAP:
4039 if (name == RES_USAGE) 4031 if (name == RES_USAGE)
4040 val = mem_cgroup_usage(mem, true); 4032 val = mem_cgroup_usage(mem, true);
4041 else 4033 else
4042 val = res_counter_read_u64(&mem->memsw, name); 4034 val = res_counter_read_u64(&mem->memsw, name);
4043 break; 4035 break;
4044 default: 4036 default:
4045 BUG(); 4037 BUG();
4046 break; 4038 break;
4047 } 4039 }
4048 return val; 4040 return val;
4049 } 4041 }
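mem_cgroup_read() above, like the write and reset handlers that follow, decodes cft->private with MEMFILE_TYPE()/MEMFILE_ATTR(), i.e. a resource type and an attribute packed into one integer. The real macros are defined earlier in this file; the encoding below is only a plausible stand-in with an assumed 16-bit split:

/* Illustrative only: pack/unpack a (type, attribute) pair the way the
 * MEMFILE_* helpers conceptually do.  The field width is an assumption
 * of this sketch.
 */
#include <stdio.h>

#define SKETCH_TYPE_SHIFT	16
#define SKETCH_PRIVATE(type, attr)	(((type) << SKETCH_TYPE_SHIFT) | (attr))
#define SKETCH_TYPE(val)	((val) >> SKETCH_TYPE_SHIFT)
#define SKETCH_ATTR(val)	((val) & ((1 << SKETCH_TYPE_SHIFT) - 1))

enum { T_MEM, T_MEMSWAP };			/* stand-ins for _MEM / _MEMSWAP */
enum { A_USAGE, A_LIMIT, A_MAX_USAGE, A_FAILCNT };

int main(void)
{
	int private = SKETCH_PRIVATE(T_MEMSWAP, A_LIMIT);

	printf("type=%d attr=%d\n", SKETCH_TYPE(private), SKETCH_ATTR(private));
	return 0;
}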
4050 /* 4042 /*
4051 * The user of this function is... 4043 * The user of this function is...
4052 * RES_LIMIT. 4044 * RES_LIMIT.
4053 */ 4045 */
4054 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 4046 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
4055 const char *buffer) 4047 const char *buffer)
4056 { 4048 {
4057 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4049 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4058 int type, name; 4050 int type, name;
4059 unsigned long long val; 4051 unsigned long long val;
4060 int ret; 4052 int ret;
4061 4053
4062 type = MEMFILE_TYPE(cft->private); 4054 type = MEMFILE_TYPE(cft->private);
4063 name = MEMFILE_ATTR(cft->private); 4055 name = MEMFILE_ATTR(cft->private);
4064 switch (name) { 4056 switch (name) {
4065 case RES_LIMIT: 4057 case RES_LIMIT:
4066 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 4058 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
4067 ret = -EINVAL; 4059 ret = -EINVAL;
4068 break; 4060 break;
4069 } 4061 }
4070 /* This function does all the necessary parsing...reuse it */ 4062 /* This function does all the necessary parsing...reuse it */
4071 ret = res_counter_memparse_write_strategy(buffer, &val); 4063 ret = res_counter_memparse_write_strategy(buffer, &val);
4072 if (ret) 4064 if (ret)
4073 break; 4065 break;
4074 if (type == _MEM) 4066 if (type == _MEM)
4075 ret = mem_cgroup_resize_limit(memcg, val); 4067 ret = mem_cgroup_resize_limit(memcg, val);
4076 else 4068 else
4077 ret = mem_cgroup_resize_memsw_limit(memcg, val); 4069 ret = mem_cgroup_resize_memsw_limit(memcg, val);
4078 break; 4070 break;
4079 case RES_SOFT_LIMIT: 4071 case RES_SOFT_LIMIT:
4080 ret = res_counter_memparse_write_strategy(buffer, &val); 4072 ret = res_counter_memparse_write_strategy(buffer, &val);
4081 if (ret) 4073 if (ret)
4082 break; 4074 break;
4083 /* 4075 /*
4084 * For memsw, soft limits are hard to implement in terms 4076 * For memsw, soft limits are hard to implement in terms
4085 * of semantics; for now, we support soft limits for 4077 * of semantics; for now, we support soft limits for
4086 * control without swap 4078 * control without swap
4087 */ 4079 */
4088 if (type == _MEM) 4080 if (type == _MEM)
4089 ret = res_counter_set_soft_limit(&memcg->res, val); 4081 ret = res_counter_set_soft_limit(&memcg->res, val);
4090 else 4082 else
4091 ret = -EINVAL; 4083 ret = -EINVAL;
4092 break; 4084 break;
4093 default: 4085 default:
4094 ret = -EINVAL; /* should be BUG() ? */ 4086 ret = -EINVAL; /* should be BUG() ? */
4095 break; 4087 break;
4096 } 4088 }
4097 return ret; 4089 return ret;
4098 } 4090 }
4099 4091
4100 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 4092 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
4101 unsigned long long *mem_limit, unsigned long long *memsw_limit) 4093 unsigned long long *mem_limit, unsigned long long *memsw_limit)
4102 { 4094 {
4103 struct cgroup *cgroup; 4095 struct cgroup *cgroup;
4104 unsigned long long min_limit, min_memsw_limit, tmp; 4096 unsigned long long min_limit, min_memsw_limit, tmp;
4105 4097
4106 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4098 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
4107 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4099 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4108 cgroup = memcg->css.cgroup; 4100 cgroup = memcg->css.cgroup;
4109 if (!memcg->use_hierarchy) 4101 if (!memcg->use_hierarchy)
4110 goto out; 4102 goto out;
4111 4103
4112 while (cgroup->parent) { 4104 while (cgroup->parent) {
4113 cgroup = cgroup->parent; 4105 cgroup = cgroup->parent;
4114 memcg = mem_cgroup_from_cont(cgroup); 4106 memcg = mem_cgroup_from_cont(cgroup);
4115 if (!memcg->use_hierarchy) 4107 if (!memcg->use_hierarchy)
4116 break; 4108 break;
4117 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 4109 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
4118 min_limit = min(min_limit, tmp); 4110 min_limit = min(min_limit, tmp);
4119 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4111 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
4120 min_memsw_limit = min(min_memsw_limit, tmp); 4112 min_memsw_limit = min(min_memsw_limit, tmp);
4121 } 4113 }
4122 out: 4114 out:
4123 *mem_limit = min_limit; 4115 *mem_limit = min_limit;
4124 *memsw_limit = min_memsw_limit; 4116 *memsw_limit = min_memsw_limit;
4125 return; 4117 return;
4126 } 4118 }
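memcg_get_hierarchical_limit() above walks up the parents and keeps the minimum limit seen for as long as use_hierarchy stays set, so the reported hierarchical limit is the tightest one on the path toward the root. A small sketch of that walk over a toy parent-linked structure (all names and fields here are invented for the example):

/* Sketch only (not kernel code): take the minimum limit along the
 * parent chain, stopping where use_hierarchy is not set.
 */
#include <stdio.h>

struct node {
	struct node *parent;
	int use_hierarchy;
	unsigned long long limit;
};

static unsigned long long hierarchical_limit(struct node *n)
{
	unsigned long long min_limit = n->limit;

	if (!n->use_hierarchy)
		return min_limit;

	for (n = n->parent; n; n = n->parent) {
		if (!n->use_hierarchy)
			break;
		if (n->limit < min_limit)
			min_limit = n->limit;
	}
	return min_limit;
}

int main(void)
{
	struct node root  = { NULL,  1, 1ULL << 40 };	/* 1 TiB */
	struct node mid   = { &root, 1, 1ULL << 30 };	/* 1 GiB */
	struct node child = { &mid,  1, 1ULL << 33 };	/* 8 GiB */

	printf("effective limit: %llu bytes\n", hierarchical_limit(&child));
	return 0;
}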
4127 4119
4128 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 4120 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4129 { 4121 {
4130 struct mem_cgroup *mem; 4122 struct mem_cgroup *mem;
4131 int type, name; 4123 int type, name;
4132 4124
4133 mem = mem_cgroup_from_cont(cont); 4125 mem = mem_cgroup_from_cont(cont);
4134 type = MEMFILE_TYPE(event); 4126 type = MEMFILE_TYPE(event);
4135 name = MEMFILE_ATTR(event); 4127 name = MEMFILE_ATTR(event);
4136 switch (name) { 4128 switch (name) {
4137 case RES_MAX_USAGE: 4129 case RES_MAX_USAGE:
4138 if (type == _MEM) 4130 if (type == _MEM)
4139 res_counter_reset_max(&mem->res); 4131 res_counter_reset_max(&mem->res);
4140 else 4132 else
4141 res_counter_reset_max(&mem->memsw); 4133 res_counter_reset_max(&mem->memsw);
4142 break; 4134 break;
4143 case RES_FAILCNT: 4135 case RES_FAILCNT:
4144 if (type == _MEM) 4136 if (type == _MEM)
4145 res_counter_reset_failcnt(&mem->res); 4137 res_counter_reset_failcnt(&mem->res);
4146 else 4138 else
4147 res_counter_reset_failcnt(&mem->memsw); 4139 res_counter_reset_failcnt(&mem->memsw);
4148 break; 4140 break;
4149 } 4141 }
4150 4142
4151 return 0; 4143 return 0;
4152 } 4144 }
4153 4145
4154 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 4146 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4155 struct cftype *cft) 4147 struct cftype *cft)
4156 { 4148 {
4157 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 4149 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
4158 } 4150 }
4159 4151
4160 #ifdef CONFIG_MMU 4152 #ifdef CONFIG_MMU
4161 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4153 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4162 struct cftype *cft, u64 val) 4154 struct cftype *cft, u64 val)
4163 { 4155 {
4164 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4156 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4165 4157
4166 if (val >= (1 << NR_MOVE_TYPE)) 4158 if (val >= (1 << NR_MOVE_TYPE))
4167 return -EINVAL; 4159 return -EINVAL;
4168 /* 4160 /*
4169 * We check this value several times, both in can_attach() and 4161 * We check this value several times, both in can_attach() and
4170 * attach(), so we need the cgroup lock to prevent this value from being 4162 * attach(), so we need the cgroup lock to prevent this value from being
4171 * inconsistent. 4163 * inconsistent.
4172 */ 4164 */
4173 cgroup_lock(); 4165 cgroup_lock();
4174 mem->move_charge_at_immigrate = val; 4166 mem->move_charge_at_immigrate = val;
4175 cgroup_unlock(); 4167 cgroup_unlock();
4176 4168
4177 return 0; 4169 return 0;
4178 } 4170 }
4179 #else 4171 #else
4180 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4172 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4181 struct cftype *cft, u64 val) 4173 struct cftype *cft, u64 val)
4182 { 4174 {
4183 return -ENOSYS; 4175 return -ENOSYS;
4184 } 4176 }
4185 #endif 4177 #endif
4186 4178
4187 4179
4188 /* For read statistics */ 4180 /* For read statistics */
4189 enum { 4181 enum {
4190 MCS_CACHE, 4182 MCS_CACHE,
4191 MCS_RSS, 4183 MCS_RSS,
4192 MCS_FILE_MAPPED, 4184 MCS_FILE_MAPPED,
4193 MCS_PGPGIN, 4185 MCS_PGPGIN,
4194 MCS_PGPGOUT, 4186 MCS_PGPGOUT,
4195 MCS_SWAP, 4187 MCS_SWAP,
4196 MCS_PGFAULT, 4188 MCS_PGFAULT,
4197 MCS_PGMAJFAULT, 4189 MCS_PGMAJFAULT,
4198 MCS_INACTIVE_ANON, 4190 MCS_INACTIVE_ANON,
4199 MCS_ACTIVE_ANON, 4191 MCS_ACTIVE_ANON,
4200 MCS_INACTIVE_FILE, 4192 MCS_INACTIVE_FILE,
4201 MCS_ACTIVE_FILE, 4193 MCS_ACTIVE_FILE,
4202 MCS_UNEVICTABLE, 4194 MCS_UNEVICTABLE,
4203 NR_MCS_STAT, 4195 NR_MCS_STAT,
4204 }; 4196 };
4205 4197
4206 struct mcs_total_stat { 4198 struct mcs_total_stat {
4207 s64 stat[NR_MCS_STAT]; 4199 s64 stat[NR_MCS_STAT];
4208 }; 4200 };
4209 4201
4210 struct { 4202 struct {
4211 char *local_name; 4203 char *local_name;
4212 char *total_name; 4204 char *total_name;
4213 } memcg_stat_strings[NR_MCS_STAT] = { 4205 } memcg_stat_strings[NR_MCS_STAT] = {
4214 {"cache", "total_cache"}, 4206 {"cache", "total_cache"},
4215 {"rss", "total_rss"}, 4207 {"rss", "total_rss"},
4216 {"mapped_file", "total_mapped_file"}, 4208 {"mapped_file", "total_mapped_file"},
4217 {"pgpgin", "total_pgpgin"}, 4209 {"pgpgin", "total_pgpgin"},
4218 {"pgpgout", "total_pgpgout"}, 4210 {"pgpgout", "total_pgpgout"},
4219 {"swap", "total_swap"}, 4211 {"swap", "total_swap"},
4220 {"pgfault", "total_pgfault"}, 4212 {"pgfault", "total_pgfault"},
4221 {"pgmajfault", "total_pgmajfault"}, 4213 {"pgmajfault", "total_pgmajfault"},
4222 {"inactive_anon", "total_inactive_anon"}, 4214 {"inactive_anon", "total_inactive_anon"},
4223 {"active_anon", "total_active_anon"}, 4215 {"active_anon", "total_active_anon"},
4224 {"inactive_file", "total_inactive_file"}, 4216 {"inactive_file", "total_inactive_file"},
4225 {"active_file", "total_active_file"}, 4217 {"active_file", "total_active_file"},
4226 {"unevictable", "total_unevictable"} 4218 {"unevictable", "total_unevictable"}
4227 }; 4219 };
4228 4220
4229 4221
4230 static void 4222 static void
4231 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4223 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4232 { 4224 {
4233 s64 val; 4225 s64 val;
4234 4226
4235 /* per cpu stat */ 4227 /* per cpu stat */
4236 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4228 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
4237 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4229 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4238 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4230 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
4239 s->stat[MCS_RSS] += val * PAGE_SIZE; 4231 s->stat[MCS_RSS] += val * PAGE_SIZE;
4240 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4232 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
4241 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4233 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4234 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
4243 s->stat[MCS_PGPGIN] += val; 4235 s->stat[MCS_PGPGIN] += val;
4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4236 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
4245 s->stat[MCS_PGPGOUT] += val; 4237 s->stat[MCS_PGPGOUT] += val;
4246 if (do_swap_account) { 4238 if (do_swap_account) {
4247 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4239 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
4248 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4240 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4249 } 4241 }
4250 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4251 s->stat[MCS_PGFAULT] += val; 4243 s->stat[MCS_PGFAULT] += val;
4252 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4253 s->stat[MCS_PGMAJFAULT] += val; 4245 s->stat[MCS_PGMAJFAULT] += val;
4254 4246
4255 /* per zone stat */ 4247 /* per zone stat */
4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4248 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON));
4257 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4249 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4258 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4250 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON));
4259 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4251 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4260 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4252 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE));
4261 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4253 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4262 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4254 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE));
4263 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4255 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4264 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE));
4265 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4257 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4266 } 4258 }
4267 4259
4268 static void 4260 static void
4269 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4261 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
4270 { 4262 {
4271 struct mem_cgroup *iter; 4263 struct mem_cgroup *iter;
4272 4264
4273 for_each_mem_cgroup_tree(iter, mem) 4265 for_each_mem_cgroup_tree(iter, mem)
4274 mem_cgroup_get_local_stat(iter, s); 4266 mem_cgroup_get_local_stat(iter, s);
4275 } 4267 }
4276 4268
4277 #ifdef CONFIG_NUMA 4269 #ifdef CONFIG_NUMA
4278 static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4270 static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4279 { 4271 {
4280 int nid; 4272 int nid;
4281 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4273 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4282 unsigned long node_nr; 4274 unsigned long node_nr;
4283 struct cgroup *cont = m->private; 4275 struct cgroup *cont = m->private;
4284 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4276 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4285 4277
4286 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL); 4278 total_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL);
4287 seq_printf(m, "total=%lu", total_nr); 4279 seq_printf(m, "total=%lu", total_nr);
4288 for_each_node_state(nid, N_HIGH_MEMORY) { 4280 for_each_node_state(nid, N_HIGH_MEMORY) {
4289 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL); 4281 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, LRU_ALL);
4290 seq_printf(m, " N%d=%lu", nid, node_nr); 4282 seq_printf(m, " N%d=%lu", nid, node_nr);
4291 } 4283 }
4292 seq_putc(m, '\n'); 4284 seq_putc(m, '\n');
4293 4285
4294 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE); 4286 file_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_FILE);
4295 seq_printf(m, "file=%lu", file_nr); 4287 seq_printf(m, "file=%lu", file_nr);
4296 for_each_node_state(nid, N_HIGH_MEMORY) { 4288 for_each_node_state(nid, N_HIGH_MEMORY) {
4297 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4289 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4298 LRU_ALL_FILE); 4290 LRU_ALL_FILE);
4299 seq_printf(m, " N%d=%lu", nid, node_nr); 4291 seq_printf(m, " N%d=%lu", nid, node_nr);
4300 } 4292 }
4301 seq_putc(m, '\n'); 4293 seq_putc(m, '\n');
4302 4294
4303 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON); 4295 anon_nr = mem_cgroup_nr_lru_pages(mem_cont, LRU_ALL_ANON);
4304 seq_printf(m, "anon=%lu", anon_nr); 4296 seq_printf(m, "anon=%lu", anon_nr);
4305 for_each_node_state(nid, N_HIGH_MEMORY) { 4297 for_each_node_state(nid, N_HIGH_MEMORY) {
4306 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4298 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4307 LRU_ALL_ANON); 4299 LRU_ALL_ANON);
4308 seq_printf(m, " N%d=%lu", nid, node_nr); 4300 seq_printf(m, " N%d=%lu", nid, node_nr);
4309 } 4301 }
4310 seq_putc(m, '\n'); 4302 seq_putc(m, '\n');
4311 4303
4312 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE)); 4304 unevictable_nr = mem_cgroup_nr_lru_pages(mem_cont, BIT(LRU_UNEVICTABLE));
4313 seq_printf(m, "unevictable=%lu", unevictable_nr); 4305 seq_printf(m, "unevictable=%lu", unevictable_nr);
4314 for_each_node_state(nid, N_HIGH_MEMORY) { 4306 for_each_node_state(nid, N_HIGH_MEMORY) {
4315 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid, 4307 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid,
4316 BIT(LRU_UNEVICTABLE)); 4308 BIT(LRU_UNEVICTABLE));
4317 seq_printf(m, " N%d=%lu", nid, node_nr); 4309 seq_printf(m, " N%d=%lu", nid, node_nr);
4318 } 4310 }
4319 seq_putc(m, '\n'); 4311 seq_putc(m, '\n');
4320 return 0; 4312 return 0;
4321 } 4313 }
4322 #endif /* CONFIG_NUMA */ 4314 #endif /* CONFIG_NUMA */
4323 4315
4324 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4316 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4325 struct cgroup_map_cb *cb) 4317 struct cgroup_map_cb *cb)
4326 { 4318 {
4327 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 4319 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4328 struct mcs_total_stat mystat; 4320 struct mcs_total_stat mystat;
4329 int i; 4321 int i;
4330 4322
4331 memset(&mystat, 0, sizeof(mystat)); 4323 memset(&mystat, 0, sizeof(mystat));
4332 mem_cgroup_get_local_stat(mem_cont, &mystat); 4324 mem_cgroup_get_local_stat(mem_cont, &mystat);
4333 4325
4334 4326
4335 for (i = 0; i < NR_MCS_STAT; i++) { 4327 for (i = 0; i < NR_MCS_STAT; i++) {
4336 if (i == MCS_SWAP && !do_swap_account) 4328 if (i == MCS_SWAP && !do_swap_account)
4337 continue; 4329 continue;
4338 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4330 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
4339 } 4331 }
4340 4332
4341 /* Hierarchical information */ 4333 /* Hierarchical information */
4342 { 4334 {
4343 unsigned long long limit, memsw_limit; 4335 unsigned long long limit, memsw_limit;
4344 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 4336 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
4345 cb->fill(cb, "hierarchical_memory_limit", limit); 4337 cb->fill(cb, "hierarchical_memory_limit", limit);
4346 if (do_swap_account) 4338 if (do_swap_account)
4347 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4339 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
4348 } 4340 }
4349 4341
4350 memset(&mystat, 0, sizeof(mystat)); 4342 memset(&mystat, 0, sizeof(mystat));
4351 mem_cgroup_get_total_stat(mem_cont, &mystat); 4343 mem_cgroup_get_total_stat(mem_cont, &mystat);
4352 for (i = 0; i < NR_MCS_STAT; i++) { 4344 for (i = 0; i < NR_MCS_STAT; i++) {
4353 if (i == MCS_SWAP && !do_swap_account) 4345 if (i == MCS_SWAP && !do_swap_account)
4354 continue; 4346 continue;
4355 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4347 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
4356 } 4348 }
4357 4349
4358 #ifdef CONFIG_DEBUG_VM 4350 #ifdef CONFIG_DEBUG_VM
4359 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 4351 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4360 4352
4361 { 4353 {
4362 int nid, zid; 4354 int nid, zid;
4363 struct mem_cgroup_per_zone *mz; 4355 struct mem_cgroup_per_zone *mz;
4364 unsigned long recent_rotated[2] = {0, 0}; 4356 unsigned long recent_rotated[2] = {0, 0};
4365 unsigned long recent_scanned[2] = {0, 0}; 4357 unsigned long recent_scanned[2] = {0, 0};
4366 4358
4367 for_each_online_node(nid) 4359 for_each_online_node(nid)
4368 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4360 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4369 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 4361 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
4370 4362
4371 recent_rotated[0] += 4363 recent_rotated[0] +=
4372 mz->reclaim_stat.recent_rotated[0]; 4364 mz->reclaim_stat.recent_rotated[0];
4373 recent_rotated[1] += 4365 recent_rotated[1] +=
4374 mz->reclaim_stat.recent_rotated[1]; 4366 mz->reclaim_stat.recent_rotated[1];
4375 recent_scanned[0] += 4367 recent_scanned[0] +=
4376 mz->reclaim_stat.recent_scanned[0]; 4368 mz->reclaim_stat.recent_scanned[0];
4377 recent_scanned[1] += 4369 recent_scanned[1] +=
4378 mz->reclaim_stat.recent_scanned[1]; 4370 mz->reclaim_stat.recent_scanned[1];
4379 } 4371 }
4380 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4372 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
4381 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4373 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
4382 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4374 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
4383 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4375 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
4384 } 4376 }
4385 #endif 4377 #endif
4386 4378
4387 return 0; 4379 return 0;
4388 } 4380 }
4389 4381
4390 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 4382 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
4391 { 4383 {
4392 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4384 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4393 4385
4394 return mem_cgroup_swappiness(memcg); 4386 return mem_cgroup_swappiness(memcg);
4395 } 4387 }
4396 4388
4397 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 4389 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
4398 u64 val) 4390 u64 val)
4399 { 4391 {
4400 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4392 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4401 struct mem_cgroup *parent; 4393 struct mem_cgroup *parent;
4402 4394
4403 if (val > 100) 4395 if (val > 100)
4404 return -EINVAL; 4396 return -EINVAL;
4405 4397
4406 if (cgrp->parent == NULL) 4398 if (cgrp->parent == NULL)
4407 return -EINVAL; 4399 return -EINVAL;
4408 4400
4409 parent = mem_cgroup_from_cont(cgrp->parent); 4401 parent = mem_cgroup_from_cont(cgrp->parent);
4410 4402
4411 cgroup_lock(); 4403 cgroup_lock();
4412 4404
4413 /* If under hierarchy, only empty-root can set this value */ 4405 /* If under hierarchy, only empty-root can set this value */
4414 if ((parent->use_hierarchy) || 4406 if ((parent->use_hierarchy) ||
4415 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 4407 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4416 cgroup_unlock(); 4408 cgroup_unlock();
4417 return -EINVAL; 4409 return -EINVAL;
4418 } 4410 }
4419 4411
4420 memcg->swappiness = val; 4412 memcg->swappiness = val;
4421 4413
4422 cgroup_unlock(); 4414 cgroup_unlock();
4423 4415
4424 return 0; 4416 return 0;
4425 } 4417 }
4426 4418
4427 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 4419 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4428 { 4420 {
4429 struct mem_cgroup_threshold_ary *t; 4421 struct mem_cgroup_threshold_ary *t;
4430 u64 usage; 4422 u64 usage;
4431 int i; 4423 int i;
4432 4424
4433 rcu_read_lock(); 4425 rcu_read_lock();
4434 if (!swap) 4426 if (!swap)
4435 t = rcu_dereference(memcg->thresholds.primary); 4427 t = rcu_dereference(memcg->thresholds.primary);
4436 else 4428 else
4437 t = rcu_dereference(memcg->memsw_thresholds.primary); 4429 t = rcu_dereference(memcg->memsw_thresholds.primary);
4438 4430
4439 if (!t) 4431 if (!t)
4440 goto unlock; 4432 goto unlock;
4441 4433
4442 usage = mem_cgroup_usage(memcg, swap); 4434 usage = mem_cgroup_usage(memcg, swap);
4443 4435
4444 /* 4436 /*
4445 * current_threshold points to the threshold just below usage. 4437 * current_threshold points to the threshold just below usage.
4446 * If that is not true, a threshold was crossed after the last 4438 * If that is not true, a threshold was crossed after the last
4447 * call of __mem_cgroup_threshold(). 4439 * call of __mem_cgroup_threshold().
4448 */ 4440 */
4449 i = t->current_threshold; 4441 i = t->current_threshold;
4450 4442
4451 /* 4443 /*
4452 * Iterate backward over array of thresholds starting from 4444 * Iterate backward over array of thresholds starting from
4453 * current_threshold and check if a threshold is crossed. 4445 * current_threshold and check if a threshold is crossed.
4454 * If none of thresholds below usage is crossed, we read 4446 * If none of thresholds below usage is crossed, we read
4455 * only one element of the array here. 4447 * only one element of the array here.
4456 */ 4448 */
4457 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 4449 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
4458 eventfd_signal(t->entries[i].eventfd, 1); 4450 eventfd_signal(t->entries[i].eventfd, 1);
4459 4451
4460 /* i = current_threshold + 1 */ 4452 /* i = current_threshold + 1 */
4461 i++; 4453 i++;
4462 4454
4463 /* 4455 /*
4464 * Iterate forward over array of thresholds starting from 4456 * Iterate forward over array of thresholds starting from
4465 * current_threshold+1 and check if a threshold is crossed. 4457 * current_threshold+1 and check if a threshold is crossed.
4466 * If none of thresholds above usage is crossed, we read 4458 * If none of thresholds above usage is crossed, we read
4467 * only one element of the array here. 4459 * only one element of the array here.
4468 */ 4460 */
4469 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4461 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4470 eventfd_signal(t->entries[i].eventfd, 1); 4462 eventfd_signal(t->entries[i].eventfd, 1);
4471 4463
4472 /* Update current_threshold */ 4464 /* Update current_threshold */
4473 t->current_threshold = i - 1; 4465 t->current_threshold = i - 1;
4474 unlock: 4466 unlock:
4475 rcu_read_unlock(); 4467 rcu_read_unlock();
4476 } 4468 }
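__mem_cgroup_threshold() above depends on the thresholds living in a sorted array, with current_threshold remembering the largest entry at or below the previously seen usage; crossings are found by scanning backward and then forward from that index. A stand-alone sketch of the same scan, with printf() standing in for eventfd_signal():

/* Sketch only (not kernel code) of the sorted-threshold scan: signal
 * every entry between the previous position and the current usage,
 * then remember the new position.
 */
#include <stdio.h>

struct entry { unsigned long long threshold; };

static struct entry entries[] = {
	{ 100 }, { 200 }, { 300 }, { 400 },
};
static int nr_entries = 4;
static int current_threshold = 1;	/* last usage was between 200 and 300 */

static void check_thresholds(unsigned long long usage)
{
	int i = current_threshold;

	/* usage dropped: signal the entries we fell below */
	for (; i >= 0 && entries[i].threshold > usage; i--)
		printf("signal (fell below %llu)\n", entries[i].threshold);

	/* usage grew: signal the entries we climbed above */
	for (i++; i < nr_entries && entries[i].threshold <= usage; i++)
		printf("signal (rose above %llu)\n", entries[i].threshold);

	current_threshold = i - 1;
}

int main(void)
{
	check_thresholds(350);	/* crosses the 300 threshold upward */
	check_thresholds(150);	/* falls back below 300 and 200 */
	return 0;
}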
4477 4469
4478 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4470 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4479 { 4471 {
4480 while (memcg) { 4472 while (memcg) {
4481 __mem_cgroup_threshold(memcg, false); 4473 __mem_cgroup_threshold(memcg, false);
4482 if (do_swap_account) 4474 if (do_swap_account)
4483 __mem_cgroup_threshold(memcg, true); 4475 __mem_cgroup_threshold(memcg, true);
4484 4476
4485 memcg = parent_mem_cgroup(memcg); 4477 memcg = parent_mem_cgroup(memcg);
4486 } 4478 }
4487 } 4479 }
4488 4480
4489 static int compare_thresholds(const void *a, const void *b) 4481 static int compare_thresholds(const void *a, const void *b)
4490 { 4482 {
4491 const struct mem_cgroup_threshold *_a = a; 4483 const struct mem_cgroup_threshold *_a = a;
4492 const struct mem_cgroup_threshold *_b = b; 4484 const struct mem_cgroup_threshold *_b = b;
4493 4485
4494 return _a->threshold - _b->threshold; 4486 return _a->threshold - _b->threshold;
4495 } 4487 }
4496 4488
4497 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4489 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4498 { 4490 {
4499 struct mem_cgroup_eventfd_list *ev; 4491 struct mem_cgroup_eventfd_list *ev;
4500 4492
4501 list_for_each_entry(ev, &mem->oom_notify, list) 4493 list_for_each_entry(ev, &mem->oom_notify, list)
4502 eventfd_signal(ev->eventfd, 1); 4494 eventfd_signal(ev->eventfd, 1);
4503 return 0; 4495 return 0;
4504 } 4496 }
4505 4497
4506 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4498 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4507 { 4499 {
4508 struct mem_cgroup *iter; 4500 struct mem_cgroup *iter;
4509 4501
4510 for_each_mem_cgroup_tree(iter, mem) 4502 for_each_mem_cgroup_tree(iter, mem)
4511 mem_cgroup_oom_notify_cb(iter); 4503 mem_cgroup_oom_notify_cb(iter);
4512 } 4504 }
4513 4505
4514 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4506 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4515 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4507 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4516 { 4508 {
4517 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4509 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4518 struct mem_cgroup_thresholds *thresholds; 4510 struct mem_cgroup_thresholds *thresholds;
4519 struct mem_cgroup_threshold_ary *new; 4511 struct mem_cgroup_threshold_ary *new;
4520 int type = MEMFILE_TYPE(cft->private); 4512 int type = MEMFILE_TYPE(cft->private);
4521 u64 threshold, usage; 4513 u64 threshold, usage;
4522 int i, size, ret; 4514 int i, size, ret;
4523 4515
4524 ret = res_counter_memparse_write_strategy(args, &threshold); 4516 ret = res_counter_memparse_write_strategy(args, &threshold);
4525 if (ret) 4517 if (ret)
4526 return ret; 4518 return ret;
4527 4519
4528 mutex_lock(&memcg->thresholds_lock); 4520 mutex_lock(&memcg->thresholds_lock);
4529 4521
4530 if (type == _MEM) 4522 if (type == _MEM)
4531 thresholds = &memcg->thresholds; 4523 thresholds = &memcg->thresholds;
4532 else if (type == _MEMSWAP) 4524 else if (type == _MEMSWAP)
4533 thresholds = &memcg->memsw_thresholds; 4525 thresholds = &memcg->memsw_thresholds;
4534 else 4526 else
4535 BUG(); 4527 BUG();
4536 4528
4537 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4529 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4538 4530
4539 /* Check if a threshold crossed before adding a new one */ 4531 /* Check if a threshold crossed before adding a new one */
4540 if (thresholds->primary) 4532 if (thresholds->primary)
4541 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4533 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4542 4534
4543 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4535 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4544 4536
4545 /* Allocate memory for new array of thresholds */ 4537 /* Allocate memory for new array of thresholds */
4546 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4538 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4547 GFP_KERNEL); 4539 GFP_KERNEL);
4548 if (!new) { 4540 if (!new) {
4549 ret = -ENOMEM; 4541 ret = -ENOMEM;
4550 goto unlock; 4542 goto unlock;
4551 } 4543 }
4552 new->size = size; 4544 new->size = size;
4553 4545
4554 /* Copy thresholds (if any) to new array */ 4546 /* Copy thresholds (if any) to new array */
4555 if (thresholds->primary) { 4547 if (thresholds->primary) {
4556 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4548 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4557 sizeof(struct mem_cgroup_threshold)); 4549 sizeof(struct mem_cgroup_threshold));
4558 } 4550 }
4559 4551
4560 /* Add new threshold */ 4552 /* Add new threshold */
4561 new->entries[size - 1].eventfd = eventfd; 4553 new->entries[size - 1].eventfd = eventfd;
4562 new->entries[size - 1].threshold = threshold; 4554 new->entries[size - 1].threshold = threshold;
4563 4555
4564 /* Sort thresholds. Registering a new threshold isn't time-critical */ 4556 /* Sort thresholds. Registering a new threshold isn't time-critical */
4565 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4557 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4566 compare_thresholds, NULL); 4558 compare_thresholds, NULL);
4567 4559
4568 /* Find current threshold */ 4560 /* Find current threshold */
4569 new->current_threshold = -1; 4561 new->current_threshold = -1;
4570 for (i = 0; i < size; i++) { 4562 for (i = 0; i < size; i++) {
4571 if (new->entries[i].threshold < usage) { 4563 if (new->entries[i].threshold < usage) {
4572 /* 4564 /*
4573 * new->current_threshold will not be used until 4565 * new->current_threshold will not be used until
4574 * rcu_assign_pointer(), so it's safe to increment 4566 * rcu_assign_pointer(), so it's safe to increment
4575 * it here. 4567 * it here.
4576 */ 4568 */
4577 ++new->current_threshold; 4569 ++new->current_threshold;
4578 } 4570 }
4579 } 4571 }
4580 4572
4581 /* Free old spare buffer and save old primary buffer as spare */ 4573 /* Free old spare buffer and save old primary buffer as spare */
4582 kfree(thresholds->spare); 4574 kfree(thresholds->spare);
4583 thresholds->spare = thresholds->primary; 4575 thresholds->spare = thresholds->primary;
4584 4576
4585 rcu_assign_pointer(thresholds->primary, new); 4577 rcu_assign_pointer(thresholds->primary, new);
4586 4578
4587 /* To be sure that nobody uses thresholds */ 4579 /* To be sure that nobody uses thresholds */
4588 synchronize_rcu(); 4580 synchronize_rcu();
4589 4581
4590 unlock: 4582 unlock:
4591 mutex_unlock(&memcg->thresholds_lock); 4583 mutex_unlock(&memcg->thresholds_lock);
4592 4584
4593 return ret; 4585 return ret;
4594 } 4586 }
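Registering a threshold above follows the classic copy-update pattern: build a new array next to the live one, publish it, wait for readers, and keep the old array as a spare for the next update. A stripped-down user-space sketch of that sequence; the RCU primitives themselves are only indicated in comments:

/* Sketch only (not kernel code): readers always see either the old or
 * the new array, never a half-built one.
 */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct thresh_array {
	int size;
	unsigned long long entries[];	/* kept sorted in the real code */
};

static struct thresh_array *primary;	/* what readers dereference */
static struct thresh_array *spare;	/* old array kept for reuse */

static int add_threshold(unsigned long long value)
{
	int size = primary ? primary->size + 1 : 1;
	struct thresh_array *new;

	new = malloc(sizeof(*new) + size * sizeof(new->entries[0]));
	if (!new)
		return -1;
	new->size = size;
	if (primary)
		memcpy(new->entries, primary->entries,
		       (size - 1) * sizeof(new->entries[0]));
	new->entries[size - 1] = value;
	/* the kernel sort()s the array here; omitted in this sketch */

	free(spare);			/* kfree(thresholds->spare) */
	spare = primary;		/* old primary becomes the spare */
	primary = new;			/* rcu_assign_pointer(primary, new) */
	/* synchronize_rcu() would go here before the spare is reused */
	return 0;
}

int main(void)
{
	add_threshold(100);
	add_threshold(50);
	printf("%d thresholds installed\n", primary->size);
	return 0;
}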
4595 4587
4596 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4588 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4597 struct cftype *cft, struct eventfd_ctx *eventfd) 4589 struct cftype *cft, struct eventfd_ctx *eventfd)
4598 { 4590 {
4599 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4591 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4600 struct mem_cgroup_thresholds *thresholds; 4592 struct mem_cgroup_thresholds *thresholds;
4601 struct mem_cgroup_threshold_ary *new; 4593 struct mem_cgroup_threshold_ary *new;
4602 int type = MEMFILE_TYPE(cft->private); 4594 int type = MEMFILE_TYPE(cft->private);
4603 u64 usage; 4595 u64 usage;
4604 int i, j, size; 4596 int i, j, size;
4605 4597
4606 mutex_lock(&memcg->thresholds_lock); 4598 mutex_lock(&memcg->thresholds_lock);
4607 if (type == _MEM) 4599 if (type == _MEM)
4608 thresholds = &memcg->thresholds; 4600 thresholds = &memcg->thresholds;
4609 else if (type == _MEMSWAP) 4601 else if (type == _MEMSWAP)
4610 thresholds = &memcg->memsw_thresholds; 4602 thresholds = &memcg->memsw_thresholds;
4611 else 4603 else
4612 BUG(); 4604 BUG();
4613 4605
4614 /* 4606 /*
4615 * Something went wrong if we are trying to unregister a threshold 4607 * Something went wrong if we are trying to unregister a threshold
4616 * when we don't have any thresholds 4608 * when we don't have any thresholds
4617 */ 4609 */
4618 BUG_ON(!thresholds); 4610 BUG_ON(!thresholds);
4619 4611
4620 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4612 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4621 4613
4622 /* Check if a threshold crossed before removing */ 4614 /* Check if a threshold crossed before removing */
4623 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4615 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4624 4616
4625 /* Calculate the new number of thresholds */ 4617 /* Calculate the new number of thresholds */
4626 size = 0; 4618 size = 0;
4627 for (i = 0; i < thresholds->primary->size; i++) { 4619 for (i = 0; i < thresholds->primary->size; i++) {
4628 if (thresholds->primary->entries[i].eventfd != eventfd) 4620 if (thresholds->primary->entries[i].eventfd != eventfd)
4629 size++; 4621 size++;
4630 } 4622 }
4631 4623
4632 new = thresholds->spare; 4624 new = thresholds->spare;
4633 4625
4634 /* Set thresholds array to NULL if we don't have thresholds */ 4626 /* Set thresholds array to NULL if we don't have thresholds */
4635 if (!size) { 4627 if (!size) {
4636 kfree(new); 4628 kfree(new);
4637 new = NULL; 4629 new = NULL;
4638 goto swap_buffers; 4630 goto swap_buffers;
4639 } 4631 }
4640 4632
4641 new->size = size; 4633 new->size = size;
4642 4634
4643 /* Copy thresholds and find current threshold */ 4635 /* Copy thresholds and find current threshold */
4644 new->current_threshold = -1; 4636 new->current_threshold = -1;
4645 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4637 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4646 if (thresholds->primary->entries[i].eventfd == eventfd) 4638 if (thresholds->primary->entries[i].eventfd == eventfd)
4647 continue; 4639 continue;
4648 4640
4649 new->entries[j] = thresholds->primary->entries[i]; 4641 new->entries[j] = thresholds->primary->entries[i];
4650 if (new->entries[j].threshold < usage) { 4642 if (new->entries[j].threshold < usage) {
4651 /* 4643 /*
4652 * new->current_threshold will not be used 4644 * new->current_threshold will not be used
4653 * until rcu_assign_pointer(), so it's safe to increment 4645 * until rcu_assign_pointer(), so it's safe to increment
4654 * it here. 4646 * it here.
4655 */ 4647 */
4656 ++new->current_threshold; 4648 ++new->current_threshold;
4657 } 4649 }
4658 j++; 4650 j++;
4659 } 4651 }
4660 4652
4661 swap_buffers: 4653 swap_buffers:
4662 /* Swap primary and spare array */ 4654 /* Swap primary and spare array */
4663 thresholds->spare = thresholds->primary; 4655 thresholds->spare = thresholds->primary;
4664 rcu_assign_pointer(thresholds->primary, new); 4656 rcu_assign_pointer(thresholds->primary, new);
4665 4657
4666 /* To be sure that nobody uses thresholds */ 4658 /* To be sure that nobody uses thresholds */
4667 synchronize_rcu(); 4659 synchronize_rcu();
4668 4660
4669 mutex_unlock(&memcg->thresholds_lock); 4661 mutex_unlock(&memcg->thresholds_lock);
4670 } 4662 }
4671 4663
4672 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4664 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4673 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4665 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4674 { 4666 {
4675 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4667 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4676 struct mem_cgroup_eventfd_list *event; 4668 struct mem_cgroup_eventfd_list *event;
4677 int type = MEMFILE_TYPE(cft->private); 4669 int type = MEMFILE_TYPE(cft->private);
4678 4670
4679 BUG_ON(type != _OOM_TYPE); 4671 BUG_ON(type != _OOM_TYPE);
4680 event = kmalloc(sizeof(*event), GFP_KERNEL); 4672 event = kmalloc(sizeof(*event), GFP_KERNEL);
4681 if (!event) 4673 if (!event)
4682 return -ENOMEM; 4674 return -ENOMEM;
4683 4675
4684 spin_lock(&memcg_oom_lock); 4676 spin_lock(&memcg_oom_lock);
4685 4677
4686 event->eventfd = eventfd; 4678 event->eventfd = eventfd;
4687 list_add(&event->list, &memcg->oom_notify); 4679 list_add(&event->list, &memcg->oom_notify);
4688 4680
4689 /* already in OOM ? */ 4681 /* already in OOM ? */
4690 if (atomic_read(&memcg->under_oom)) 4682 if (atomic_read(&memcg->under_oom))
4691 eventfd_signal(eventfd, 1); 4683 eventfd_signal(eventfd, 1);
4692 spin_unlock(&memcg_oom_lock); 4684 spin_unlock(&memcg_oom_lock);
4693 4685
4694 return 0; 4686 return 0;
4695 } 4687 }
4696 4688
4697 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4689 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4698 struct cftype *cft, struct eventfd_ctx *eventfd) 4690 struct cftype *cft, struct eventfd_ctx *eventfd)
4699 { 4691 {
4700 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4692 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4701 struct mem_cgroup_eventfd_list *ev, *tmp; 4693 struct mem_cgroup_eventfd_list *ev, *tmp;
4702 int type = MEMFILE_TYPE(cft->private); 4694 int type = MEMFILE_TYPE(cft->private);
4703 4695
4704 BUG_ON(type != _OOM_TYPE); 4696 BUG_ON(type != _OOM_TYPE);
4705 4697
4706 spin_lock(&memcg_oom_lock); 4698 spin_lock(&memcg_oom_lock);
4707 4699
4708 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4700 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4709 if (ev->eventfd == eventfd) { 4701 if (ev->eventfd == eventfd) {
4710 list_del(&ev->list); 4702 list_del(&ev->list);
4711 kfree(ev); 4703 kfree(ev);
4712 } 4704 }
4713 } 4705 }
4714 4706
4715 spin_unlock(&memcg_oom_lock); 4707 spin_unlock(&memcg_oom_lock);
4716 } 4708 }
4717 4709
4718 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4710 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4719 struct cftype *cft, struct cgroup_map_cb *cb) 4711 struct cftype *cft, struct cgroup_map_cb *cb)
4720 { 4712 {
4721 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4713 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4722 4714
4723 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4715 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4724 4716
4725 if (atomic_read(&mem->under_oom)) 4717 if (atomic_read(&mem->under_oom))
4726 cb->fill(cb, "under_oom", 1); 4718 cb->fill(cb, "under_oom", 1);
4727 else 4719 else
4728 cb->fill(cb, "under_oom", 0); 4720 cb->fill(cb, "under_oom", 0);
4729 return 0; 4721 return 0;
4730 } 4722 }
4731 4723
4732 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4724 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4733 struct cftype *cft, u64 val) 4725 struct cftype *cft, u64 val)
4734 { 4726 {
4735 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4727 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4736 struct mem_cgroup *parent; 4728 struct mem_cgroup *parent;
4737 4729
4738 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4730 /* cannot set to root cgroup and only 0 and 1 are allowed */
4739 if (!cgrp->parent || !((val == 0) || (val == 1))) 4731 if (!cgrp->parent || !((val == 0) || (val == 1)))
4740 return -EINVAL; 4732 return -EINVAL;
4741 4733
4742 parent = mem_cgroup_from_cont(cgrp->parent); 4734 parent = mem_cgroup_from_cont(cgrp->parent);
4743 4735
4744 cgroup_lock(); 4736 cgroup_lock();
4745 /* oom-kill-disable is a flag for subhierarchy. */ 4737 /* oom-kill-disable is a flag for subhierarchy. */
4746 if ((parent->use_hierarchy) || 4738 if ((parent->use_hierarchy) ||
4747 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4739 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4748 cgroup_unlock(); 4740 cgroup_unlock();
4749 return -EINVAL; 4741 return -EINVAL;
4750 } 4742 }
4751 mem->oom_kill_disable = val; 4743 mem->oom_kill_disable = val;
4752 if (!val) 4744 if (!val)
4753 memcg_oom_recover(mem); 4745 memcg_oom_recover(mem);
4754 cgroup_unlock(); 4746 cgroup_unlock();
4755 return 0; 4747 return 0;
4756 } 4748 }
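
memory.oom_control is also writable: storing 1 sets oom_kill_disable so tasks in the group wait instead of being OOM-killed, and storing 0 re-enables the killer and wakes any waiters via memcg_oom_recover(). A short sketch of toggling the knob from user space; the cgroup path is again an assumption.

/* Toggle oom_kill_disable for an assumed cgroup v1 group "foo". */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

static int set_oom_kill_disable(const char *grp, int disable)
{
	char path[256], val[2] = { disable ? '1' : '0', '\n' };
	snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	/* The write handler above rejects anything but 0/1, writes on the
	 * root group, and changes inside an active use_hierarchy subtree. */
	ssize_t n = write(fd, val, sizeof(val));
	close(fd);
	return n == sizeof(val) ? 0 : -1;
}

int main(void)
{
	return set_oom_kill_disable("/sys/fs/cgroup/memory/foo", 1) ? 1 : 0;
}
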
4757 4749
4758 #ifdef CONFIG_NUMA 4750 #ifdef CONFIG_NUMA
4759 static const struct file_operations mem_control_numa_stat_file_operations = { 4751 static const struct file_operations mem_control_numa_stat_file_operations = {
4760 .read = seq_read, 4752 .read = seq_read,
4761 .llseek = seq_lseek, 4753 .llseek = seq_lseek,
4762 .release = single_release, 4754 .release = single_release,
4763 }; 4755 };
4764 4756
4765 static int mem_control_numa_stat_open(struct inode *unused, struct file *file) 4757 static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4766 { 4758 {
4767 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4759 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4768 4760
4769 file->f_op = &mem_control_numa_stat_file_operations; 4761 file->f_op = &mem_control_numa_stat_file_operations;
4770 return single_open(file, mem_control_numa_stat_show, cont); 4762 return single_open(file, mem_control_numa_stat_show, cont);
4771 } 4763 }
4772 #endif /* CONFIG_NUMA */ 4764 #endif /* CONFIG_NUMA */
4773 4765
4774 static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, 4766 static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4775 struct cftype *cft, 4767 struct cftype *cft,
4776 struct cgroup_map_cb *cb) 4768 struct cgroup_map_cb *cb)
4777 { 4769 {
4778 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4770 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4779 char string[64]; 4771 char string[64];
4780 int i; 4772 int i;
4781 4773
4782 for (i = 0; i < NR_SCANSTATS; i++) { 4774 for (i = 0; i < NR_SCANSTATS; i++) {
4783 strcpy(string, scanstat_string[i]); 4775 strcpy(string, scanstat_string[i]);
4784 strcat(string, SCANSTAT_WORD_LIMIT); 4776 strcat(string, SCANSTAT_WORD_LIMIT);
4785 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); 4777 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4786 } 4778 }
4787 4779
4788 for (i = 0; i < NR_SCANSTATS; i++) { 4780 for (i = 0; i < NR_SCANSTATS; i++) {
4789 strcpy(string, scanstat_string[i]); 4781 strcpy(string, scanstat_string[i]);
4790 strcat(string, SCANSTAT_WORD_SYSTEM); 4782 strcat(string, SCANSTAT_WORD_SYSTEM);
4791 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); 4783 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4792 } 4784 }
4793 4785
4794 for (i = 0; i < NR_SCANSTATS; i++) { 4786 for (i = 0; i < NR_SCANSTATS; i++) {
4795 strcpy(string, scanstat_string[i]); 4787 strcpy(string, scanstat_string[i]);
4796 strcat(string, SCANSTAT_WORD_LIMIT); 4788 strcat(string, SCANSTAT_WORD_LIMIT);
4797 strcat(string, SCANSTAT_WORD_HIERARCHY); 4789 strcat(string, SCANSTAT_WORD_HIERARCHY);
4798 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); 4790 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4799 } 4791 }
4800 for (i = 0; i < NR_SCANSTATS; i++) { 4792 for (i = 0; i < NR_SCANSTATS; i++) {
4801 strcpy(string, scanstat_string[i]); 4793 strcpy(string, scanstat_string[i]);
4802 strcat(string, SCANSTAT_WORD_SYSTEM); 4794 strcat(string, SCANSTAT_WORD_SYSTEM);
4803 strcat(string, SCANSTAT_WORD_HIERARCHY); 4795 strcat(string, SCANSTAT_WORD_HIERARCHY);
4804 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); 4796 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4805 } 4797 }
4806 return 0; 4798 return 0;
4807 } 4799 }
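
The four loops above only differ in which suffixes they glue onto the base counter names before handing the values to cb->fill(): per-limit vs. system-wide reclaim, each with and without the hierarchy suffix. A standalone sketch of that key construction; the string tables here are illustrative stand-ins, not copied from this file.

/* Sketch of the key naming used by the vmscan_stat read handler.
 * scanstat_string[] and the suffix words are stand-ins; the real
 * tables live earlier in memcontrol.c. */
#include <stdio.h>
#include <string.h>

#define NR_SCANSTATS 3
static const char *scanstat_string[NR_SCANSTATS] = {
	"scanned_pages", "rotated_pages", "freed_pages",   /* illustrative */
};
#define SCANSTAT_WORD_LIMIT     "_by_limit"             /* illustrative */
#define SCANSTAT_WORD_SYSTEM    "_by_system"            /* illustrative */
#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"      /* illustrative */

int main(void)
{
	char key[64];
	for (int i = 0; i < NR_SCANSTATS; i++) {
		/* local counter, reclaim triggered by the group's own limit */
		strcpy(key, scanstat_string[i]);
		strcat(key, SCANSTAT_WORD_LIMIT);
		puts(key);
		/* hierarchical counter carries the extra suffix */
		strcpy(key, scanstat_string[i]);
		strcat(key, SCANSTAT_WORD_SYSTEM);
		strcat(key, SCANSTAT_WORD_HIERARCHY);
		puts(key);
	}
	return 0;
}
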
4808 4800
4809 static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, 4801 static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4810 unsigned int event) 4802 unsigned int event)
4811 { 4803 {
4812 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4804 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4813 4805
4814 spin_lock(&mem->scanstat.lock); 4806 spin_lock(&mem->scanstat.lock);
4815 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); 4807 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4816 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); 4808 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4817 spin_unlock(&mem->scanstat.lock); 4809 spin_unlock(&mem->scanstat.lock);
4818 return 0; 4810 return 0;
4819 } 4811 }
4820 4812
4821 4813
4822 static struct cftype mem_cgroup_files[] = { 4814 static struct cftype mem_cgroup_files[] = {
4823 { 4815 {
4824 .name = "usage_in_bytes", 4816 .name = "usage_in_bytes",
4825 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4817 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4826 .read_u64 = mem_cgroup_read, 4818 .read_u64 = mem_cgroup_read,
4827 .register_event = mem_cgroup_usage_register_event, 4819 .register_event = mem_cgroup_usage_register_event,
4828 .unregister_event = mem_cgroup_usage_unregister_event, 4820 .unregister_event = mem_cgroup_usage_unregister_event,
4829 }, 4821 },
4830 { 4822 {
4831 .name = "max_usage_in_bytes", 4823 .name = "max_usage_in_bytes",
4832 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4824 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4833 .trigger = mem_cgroup_reset, 4825 .trigger = mem_cgroup_reset,
4834 .read_u64 = mem_cgroup_read, 4826 .read_u64 = mem_cgroup_read,
4835 }, 4827 },
4836 { 4828 {
4837 .name = "limit_in_bytes", 4829 .name = "limit_in_bytes",
4838 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4830 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4839 .write_string = mem_cgroup_write, 4831 .write_string = mem_cgroup_write,
4840 .read_u64 = mem_cgroup_read, 4832 .read_u64 = mem_cgroup_read,
4841 }, 4833 },
4842 { 4834 {
4843 .name = "soft_limit_in_bytes", 4835 .name = "soft_limit_in_bytes",
4844 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4836 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4845 .write_string = mem_cgroup_write, 4837 .write_string = mem_cgroup_write,
4846 .read_u64 = mem_cgroup_read, 4838 .read_u64 = mem_cgroup_read,
4847 }, 4839 },
4848 { 4840 {
4849 .name = "failcnt", 4841 .name = "failcnt",
4850 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4842 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4851 .trigger = mem_cgroup_reset, 4843 .trigger = mem_cgroup_reset,
4852 .read_u64 = mem_cgroup_read, 4844 .read_u64 = mem_cgroup_read,
4853 }, 4845 },
4854 { 4846 {
4855 .name = "stat", 4847 .name = "stat",
4856 .read_map = mem_control_stat_show, 4848 .read_map = mem_control_stat_show,
4857 }, 4849 },
4858 { 4850 {
4859 .name = "force_empty", 4851 .name = "force_empty",
4860 .trigger = mem_cgroup_force_empty_write, 4852 .trigger = mem_cgroup_force_empty_write,
4861 }, 4853 },
4862 { 4854 {
4863 .name = "use_hierarchy", 4855 .name = "use_hierarchy",
4864 .write_u64 = mem_cgroup_hierarchy_write, 4856 .write_u64 = mem_cgroup_hierarchy_write,
4865 .read_u64 = mem_cgroup_hierarchy_read, 4857 .read_u64 = mem_cgroup_hierarchy_read,
4866 }, 4858 },
4867 { 4859 {
4868 .name = "swappiness", 4860 .name = "swappiness",
4869 .read_u64 = mem_cgroup_swappiness_read, 4861 .read_u64 = mem_cgroup_swappiness_read,
4870 .write_u64 = mem_cgroup_swappiness_write, 4862 .write_u64 = mem_cgroup_swappiness_write,
4871 }, 4863 },
4872 { 4864 {
4873 .name = "move_charge_at_immigrate", 4865 .name = "move_charge_at_immigrate",
4874 .read_u64 = mem_cgroup_move_charge_read, 4866 .read_u64 = mem_cgroup_move_charge_read,
4875 .write_u64 = mem_cgroup_move_charge_write, 4867 .write_u64 = mem_cgroup_move_charge_write,
4876 }, 4868 },
4877 { 4869 {
4878 .name = "oom_control", 4870 .name = "oom_control",
4879 .read_map = mem_cgroup_oom_control_read, 4871 .read_map = mem_cgroup_oom_control_read,
4880 .write_u64 = mem_cgroup_oom_control_write, 4872 .write_u64 = mem_cgroup_oom_control_write,
4881 .register_event = mem_cgroup_oom_register_event, 4873 .register_event = mem_cgroup_oom_register_event,
4882 .unregister_event = mem_cgroup_oom_unregister_event, 4874 .unregister_event = mem_cgroup_oom_unregister_event,
4883 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4875 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4884 }, 4876 },
4885 #ifdef CONFIG_NUMA 4877 #ifdef CONFIG_NUMA
4886 { 4878 {
4887 .name = "numa_stat", 4879 .name = "numa_stat",
4888 .open = mem_control_numa_stat_open, 4880 .open = mem_control_numa_stat_open,
4889 .mode = S_IRUGO, 4881 .mode = S_IRUGO,
4890 }, 4882 },
4891 #endif 4883 #endif
4892 { 4884 {
4893 .name = "vmscan_stat", 4885 .name = "vmscan_stat",
4894 .read_map = mem_cgroup_vmscan_stat_read, 4886 .read_map = mem_cgroup_vmscan_stat_read,
4895 .trigger = mem_cgroup_reset_vmscan_stat, 4887 .trigger = mem_cgroup_reset_vmscan_stat,
4896 }, 4888 },
4897 }; 4889 };
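
Each entry's .private field packs two small integers, the counter family (_MEM, _MEMSWAP, _OOM_TYPE) and the attribute (RES_USAGE, RES_LIMIT, ...), so a single read/write/trigger handler can serve many control files. A minimal sketch of that shift-and-mask packing, written in the spirit of MEMFILE_PRIVATE()/MEMFILE_TYPE() with made-up constants rather than the macros defined earlier in this file.

/* Sketch of packing a (type, attribute) pair into one int.
 * All constants here are illustrative. */
#include <assert.h>

#define FILE_PRIVATE(type, attr)  (((type) << 16) | (attr))
#define FILE_TYPE(priv)           (((priv) >> 16) & 0xffff)
#define FILE_ATTR(priv)           ((priv) & 0xffff)

enum { T_MEM = 0, T_MEMSWAP = 1, T_OOM = 2 };           /* illustrative */
enum { A_USAGE = 0, A_LIMIT = 1, A_MAX_USAGE = 2 };     /* illustrative */

int main(void)
{
	int priv = FILE_PRIVATE(T_MEMSWAP, A_LIMIT);
	assert(FILE_TYPE(priv) == T_MEMSWAP);
	assert(FILE_ATTR(priv) == A_LIMIT);
	return 0;
}
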
4898 4890
4899 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4891 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4900 static struct cftype memsw_cgroup_files[] = { 4892 static struct cftype memsw_cgroup_files[] = {
4901 { 4893 {
4902 .name = "memsw.usage_in_bytes", 4894 .name = "memsw.usage_in_bytes",
4903 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4895 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4904 .read_u64 = mem_cgroup_read, 4896 .read_u64 = mem_cgroup_read,
4905 .register_event = mem_cgroup_usage_register_event, 4897 .register_event = mem_cgroup_usage_register_event,
4906 .unregister_event = mem_cgroup_usage_unregister_event, 4898 .unregister_event = mem_cgroup_usage_unregister_event,
4907 }, 4899 },
4908 { 4900 {
4909 .name = "memsw.max_usage_in_bytes", 4901 .name = "memsw.max_usage_in_bytes",
4910 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4902 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4911 .trigger = mem_cgroup_reset, 4903 .trigger = mem_cgroup_reset,
4912 .read_u64 = mem_cgroup_read, 4904 .read_u64 = mem_cgroup_read,
4913 }, 4905 },
4914 { 4906 {
4915 .name = "memsw.limit_in_bytes", 4907 .name = "memsw.limit_in_bytes",
4916 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4908 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4917 .write_string = mem_cgroup_write, 4909 .write_string = mem_cgroup_write,
4918 .read_u64 = mem_cgroup_read, 4910 .read_u64 = mem_cgroup_read,
4919 }, 4911 },
4920 { 4912 {
4921 .name = "memsw.failcnt", 4913 .name = "memsw.failcnt",
4922 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4914 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4923 .trigger = mem_cgroup_reset, 4915 .trigger = mem_cgroup_reset,
4924 .read_u64 = mem_cgroup_read, 4916 .read_u64 = mem_cgroup_read,
4925 }, 4917 },
4926 }; 4918 };
4927 4919
4928 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4920 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4929 { 4921 {
4930 if (!do_swap_account) 4922 if (!do_swap_account)
4931 return 0; 4923 return 0;
4932 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4924 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4933 ARRAY_SIZE(memsw_cgroup_files)); 4925 ARRAY_SIZE(memsw_cgroup_files));
4934 }; 4926 };
4935 #else 4927 #else
4936 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4928 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4937 { 4929 {
4938 return 0; 4930 return 0;
4939 } 4931 }
4940 #endif 4932 #endif
4941 4933
4942 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4934 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4943 { 4935 {
4944 struct mem_cgroup_per_node *pn; 4936 struct mem_cgroup_per_node *pn;
4945 struct mem_cgroup_per_zone *mz; 4937 struct mem_cgroup_per_zone *mz;
4946 enum lru_list l; 4938 enum lru_list l;
4947 int zone, tmp = node; 4939 int zone, tmp = node;
4948 /* 4940 /*
4949 * This routine is called against possible nodes. 4941 * This routine is called against possible nodes.
4950 * But it's BUG to call kmalloc() against offline node. 4942 * But it's BUG to call kmalloc() against offline node.
4951 * 4943 *
4952 * TODO: this routine can waste much memory for nodes which will 4944 * TODO: this routine can waste much memory for nodes which will
4953 * never be onlined. It's better to use memory hotplug callback 4945 * never be onlined. It's better to use memory hotplug callback
4954 * function. 4946 * function.
4955 */ 4947 */
4956 if (!node_state(node, N_NORMAL_MEMORY)) 4948 if (!node_state(node, N_NORMAL_MEMORY))
4957 tmp = -1; 4949 tmp = -1;
4958 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4950 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4959 if (!pn) 4951 if (!pn)
4960 return 1; 4952 return 1;
4961 4953
4962 mem->info.nodeinfo[node] = pn; 4954 mem->info.nodeinfo[node] = pn;
4963 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4955 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4964 mz = &pn->zoneinfo[zone]; 4956 mz = &pn->zoneinfo[zone];
4965 for_each_lru(l) 4957 for_each_lru(l)
4966 INIT_LIST_HEAD(&mz->lists[l]); 4958 INIT_LIST_HEAD(&mz->lists[l]);
4967 mz->usage_in_excess = 0; 4959 mz->usage_in_excess = 0;
4968 mz->on_tree = false; 4960 mz->on_tree = false;
4969 mz->mem = mem; 4961 mz->mem = mem;
4970 } 4962 }
4971 return 0; 4963 return 0;
4972 } 4964 }
4973 4965
4974 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4966 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4975 { 4967 {
4976 kfree(mem->info.nodeinfo[node]); 4968 kfree(mem->info.nodeinfo[node]);
4977 } 4969 }
4978 4970
4979 static struct mem_cgroup *mem_cgroup_alloc(void) 4971 static struct mem_cgroup *mem_cgroup_alloc(void)
4980 { 4972 {
4981 struct mem_cgroup *mem; 4973 struct mem_cgroup *mem;
4982 int size = sizeof(struct mem_cgroup); 4974 int size = sizeof(struct mem_cgroup);
4983 4975
4984 /* Can be very big if MAX_NUMNODES is very big */ 4976 /* Can be very big if MAX_NUMNODES is very big */
4985 if (size < PAGE_SIZE) 4977 if (size < PAGE_SIZE)
4986 mem = kzalloc(size, GFP_KERNEL); 4978 mem = kzalloc(size, GFP_KERNEL);
4987 else 4979 else
4988 mem = vzalloc(size); 4980 mem = vzalloc(size);
4989 4981
4990 if (!mem) 4982 if (!mem)
4991 return NULL; 4983 return NULL;
4992 4984
4993 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4985 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4994 if (!mem->stat) 4986 if (!mem->stat)
4995 goto out_free; 4987 goto out_free;
4996 spin_lock_init(&mem->pcp_counter_lock); 4988 spin_lock_init(&mem->pcp_counter_lock);
4997 return mem; 4989 return mem;
4998 4990
4999 out_free: 4991 out_free:
5000 if (size < PAGE_SIZE) 4992 if (size < PAGE_SIZE)
5001 kfree(mem); 4993 kfree(mem);
5002 else 4994 else
5003 vfree(mem); 4995 vfree(mem);
5004 return NULL; 4996 return NULL;
5005 } 4997 }
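
mem_cgroup_alloc() picks kzalloc() for objects smaller than a page and vzalloc() otherwise, and every free path repeats the same size test so the matching deallocator is used. A user-space analogue of that idiom, with calloc and mmap standing in for the two kernel allocators; it is a sketch of the pattern, not of the kernel API.

/* "small -> heap, big -> page mappings", with the free side repeating
 * the size test exactly as __mem_cgroup_free() does below. */
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

static void *alloc_obj(size_t size)
{
	if (size < (size_t)sysconf(_SC_PAGESIZE))
		return calloc(1, size);                     /* small: heap */
	void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); /* big: mappings */
	return p == MAP_FAILED ? NULL : p;
}

static void free_obj(void *p, size_t size)
{
	/* must mirror the allocation-side decision */
	if (size < (size_t)sysconf(_SC_PAGESIZE))
		free(p);
	else
		munmap(p, size);
}

int main(void)
{
	void *small = alloc_obj(128), *big = alloc_obj(1 << 20);
	free_obj(small, 128);
	free_obj(big, 1 << 20);
	return 0;
}
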
5006 4998
5007 /* 4999 /*
5008 * At destroying mem_cgroup, references from swap_cgroup can remain. 5000 * At destroying mem_cgroup, references from swap_cgroup can remain.
5009 * (scanning all at force_empty is too costly...) 5001 * (scanning all at force_empty is too costly...)
5010 * 5002 *
5011 * Instead of clearing all references at force_empty, we remember 5003 * Instead of clearing all references at force_empty, we remember
5012 * the number of reference from swap_cgroup and free mem_cgroup when 5004 * the number of reference from swap_cgroup and free mem_cgroup when
5013 * it goes down to 0. 5005 * it goes down to 0.
5014 * 5006 *
5015 * Removal of cgroup itself succeeds regardless of refs from swap. 5007 * Removal of cgroup itself succeeds regardless of refs from swap.
5016 */ 5008 */
5017 5009
5018 static void __mem_cgroup_free(struct mem_cgroup *mem) 5010 static void __mem_cgroup_free(struct mem_cgroup *mem)
5019 { 5011 {
5020 int node; 5012 int node;
5021 5013
5022 mem_cgroup_remove_from_trees(mem); 5014 mem_cgroup_remove_from_trees(mem);
5023 free_css_id(&mem_cgroup_subsys, &mem->css); 5015 free_css_id(&mem_cgroup_subsys, &mem->css);
5024 5016
5025 for_each_node_state(node, N_POSSIBLE) 5017 for_each_node_state(node, N_POSSIBLE)
5026 free_mem_cgroup_per_zone_info(mem, node); 5018 free_mem_cgroup_per_zone_info(mem, node);
5027 5019
5028 free_percpu(mem->stat); 5020 free_percpu(mem->stat);
5029 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 5021 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
5030 kfree(mem); 5022 kfree(mem);
5031 else 5023 else
5032 vfree(mem); 5024 vfree(mem);
5033 } 5025 }
5034 5026
5035 static void mem_cgroup_get(struct mem_cgroup *mem) 5027 static void mem_cgroup_get(struct mem_cgroup *mem)
5036 { 5028 {
5037 atomic_inc(&mem->refcnt); 5029 atomic_inc(&mem->refcnt);
5038 } 5030 }
5039 5031
5040 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 5032 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
5041 { 5033 {
5042 if (atomic_sub_and_test(count, &mem->refcnt)) { 5034 if (atomic_sub_and_test(count, &mem->refcnt)) {
5043 struct mem_cgroup *parent = parent_mem_cgroup(mem); 5035 struct mem_cgroup *parent = parent_mem_cgroup(mem);
5044 __mem_cgroup_free(mem); 5036 __mem_cgroup_free(mem);
5045 if (parent) 5037 if (parent)
5046 mem_cgroup_put(parent); 5038 mem_cgroup_put(parent);
5047 } 5039 }
5048 } 5040 }
5049 5041
5050 static void mem_cgroup_put(struct mem_cgroup *mem) 5042 static void mem_cgroup_put(struct mem_cgroup *mem)
5051 { 5043 {
5052 __mem_cgroup_put(mem, 1); 5044 __mem_cgroup_put(mem, 1);
5053 } 5045 }
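
The get/put pair is a plain reference count with one twist: a child holds a reference on its parent (taken in mem_cgroup_create() below), so dropping the last reference on a child may cascade up the hierarchy. A small user-space sketch of that pattern with C11 atomics; the structure and function names are invented for the example.

/* Refcount whose final put also releases the parent, mirroring
 * __mem_cgroup_put()/mem_cgroup_put().  Types are invented. */
#include <stdatomic.h>
#include <stdlib.h>

struct node {
	struct node *parent;
	atomic_int refcnt;
};

static void node_get(struct node *n)
{
	atomic_fetch_add(&n->refcnt, 1);
}

static void node_put(struct node *n)
{
	while (n) {
		if (atomic_fetch_sub(&n->refcnt, 1) != 1)
			return;                 /* others still hold references */
		struct node *parent = n->parent;
		free(n);                        /* last reference: free ...      */
		n = parent;                     /* ... then drop the parent ref  */
	}
}

int main(void)
{
	struct node *root = calloc(1, sizeof(*root));
	struct node *child = calloc(1, sizeof(*child));
	atomic_init(&root->refcnt, 1);
	atomic_init(&child->refcnt, 1);
	child->parent = root;
	node_get(root);                         /* child's reference on its parent */
	node_put(child);                        /* frees child, drops its ref on root */
	node_put(root);                         /* owner's ref gone: frees root */
	return 0;
}
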
5054 5046
5055 /* 5047 /*
5056 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 5048 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
5057 */ 5049 */
5058 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 5050 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
5059 { 5051 {
5060 if (!mem->res.parent) 5052 if (!mem->res.parent)
5061 return NULL; 5053 return NULL;
5062 return mem_cgroup_from_res_counter(mem->res.parent, res); 5054 return mem_cgroup_from_res_counter(mem->res.parent, res);
5063 } 5055 }
5064 5056
5065 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5057 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5066 static void __init enable_swap_cgroup(void) 5058 static void __init enable_swap_cgroup(void)
5067 { 5059 {
5068 if (!mem_cgroup_disabled() && really_do_swap_account) 5060 if (!mem_cgroup_disabled() && really_do_swap_account)
5069 do_swap_account = 1; 5061 do_swap_account = 1;
5070 } 5062 }
5071 #else 5063 #else
5072 static void __init enable_swap_cgroup(void) 5064 static void __init enable_swap_cgroup(void)
5073 { 5065 {
5074 } 5066 }
5075 #endif 5067 #endif
5076 5068
5077 static int mem_cgroup_soft_limit_tree_init(void) 5069 static int mem_cgroup_soft_limit_tree_init(void)
5078 { 5070 {
5079 struct mem_cgroup_tree_per_node *rtpn; 5071 struct mem_cgroup_tree_per_node *rtpn;
5080 struct mem_cgroup_tree_per_zone *rtpz; 5072 struct mem_cgroup_tree_per_zone *rtpz;
5081 int tmp, node, zone; 5073 int tmp, node, zone;
5082 5074
5083 for_each_node_state(node, N_POSSIBLE) { 5075 for_each_node_state(node, N_POSSIBLE) {
5084 tmp = node; 5076 tmp = node;
5085 if (!node_state(node, N_NORMAL_MEMORY)) 5077 if (!node_state(node, N_NORMAL_MEMORY))
5086 tmp = -1; 5078 tmp = -1;
5087 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 5079 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
5088 if (!rtpn) 5080 if (!rtpn)
5089 return 1; 5081 return 1;
5090 5082
5091 soft_limit_tree.rb_tree_per_node[node] = rtpn; 5083 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5092 5084
5093 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 5085 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5094 rtpz = &rtpn->rb_tree_per_zone[zone]; 5086 rtpz = &rtpn->rb_tree_per_zone[zone];
5095 rtpz->rb_root = RB_ROOT; 5087 rtpz->rb_root = RB_ROOT;
5096 spin_lock_init(&rtpz->lock); 5088 spin_lock_init(&rtpz->lock);
5097 } 5089 }
5098 } 5090 }
5099 return 0; 5091 return 0;
5100 } 5092 }
5101 5093
5102 static struct cgroup_subsys_state * __ref 5094 static struct cgroup_subsys_state * __ref
5103 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 5095 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5104 { 5096 {
5105 struct mem_cgroup *mem, *parent; 5097 struct mem_cgroup *mem, *parent;
5106 long error = -ENOMEM; 5098 long error = -ENOMEM;
5107 int node; 5099 int node;
5108 5100
5109 mem = mem_cgroup_alloc(); 5101 mem = mem_cgroup_alloc();
5110 if (!mem) 5102 if (!mem)
5111 return ERR_PTR(error); 5103 return ERR_PTR(error);
5112 5104
5113 for_each_node_state(node, N_POSSIBLE) 5105 for_each_node_state(node, N_POSSIBLE)
5114 if (alloc_mem_cgroup_per_zone_info(mem, node)) 5106 if (alloc_mem_cgroup_per_zone_info(mem, node))
5115 goto free_out; 5107 goto free_out;
5116 5108
5117 /* root ? */ 5109 /* root ? */
5118 if (cont->parent == NULL) { 5110 if (cont->parent == NULL) {
5119 int cpu; 5111 int cpu;
5120 enable_swap_cgroup(); 5112 enable_swap_cgroup();
5121 parent = NULL; 5113 parent = NULL;
5122 root_mem_cgroup = mem; 5114 root_mem_cgroup = mem;
5123 if (mem_cgroup_soft_limit_tree_init()) 5115 if (mem_cgroup_soft_limit_tree_init())
5124 goto free_out; 5116 goto free_out;
5125 for_each_possible_cpu(cpu) { 5117 for_each_possible_cpu(cpu) {
5126 struct memcg_stock_pcp *stock = 5118 struct memcg_stock_pcp *stock =
5127 &per_cpu(memcg_stock, cpu); 5119 &per_cpu(memcg_stock, cpu);
5128 INIT_WORK(&stock->work, drain_local_stock); 5120 INIT_WORK(&stock->work, drain_local_stock);
5129 } 5121 }
5130 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5122 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5131 } else { 5123 } else {
5132 parent = mem_cgroup_from_cont(cont->parent); 5124 parent = mem_cgroup_from_cont(cont->parent);
5133 mem->use_hierarchy = parent->use_hierarchy; 5125 mem->use_hierarchy = parent->use_hierarchy;
5134 mem->oom_kill_disable = parent->oom_kill_disable; 5126 mem->oom_kill_disable = parent->oom_kill_disable;
5135 } 5127 }
5136 5128
5137 if (parent && parent->use_hierarchy) { 5129 if (parent && parent->use_hierarchy) {
5138 res_counter_init(&mem->res, &parent->res); 5130 res_counter_init(&mem->res, &parent->res);
5139 res_counter_init(&mem->memsw, &parent->memsw); 5131 res_counter_init(&mem->memsw, &parent->memsw);
5140 /* 5132 /*
5141 * We increment refcnt of the parent to ensure that we can 5133 * We increment refcnt of the parent to ensure that we can
5142 * safely access it on res_counter_charge/uncharge. 5134 * safely access it on res_counter_charge/uncharge.
5143 * This refcnt will be decremented when freeing this 5135 * This refcnt will be decremented when freeing this
5144 * mem_cgroup(see mem_cgroup_put). 5136 * mem_cgroup(see mem_cgroup_put).
5145 */ 5137 */
5146 mem_cgroup_get(parent); 5138 mem_cgroup_get(parent);
5147 } else { 5139 } else {
5148 res_counter_init(&mem->res, NULL); 5140 res_counter_init(&mem->res, NULL);
5149 res_counter_init(&mem->memsw, NULL); 5141 res_counter_init(&mem->memsw, NULL);
5150 } 5142 }
5151 mem->last_scanned_child = 0; 5143 mem->last_scanned_child = 0;
5152 mem->last_scanned_node = MAX_NUMNODES; 5144 mem->last_scanned_node = MAX_NUMNODES;
5153 INIT_LIST_HEAD(&mem->oom_notify); 5145 INIT_LIST_HEAD(&mem->oom_notify);
5154 5146
5155 if (parent) 5147 if (parent)
5156 mem->swappiness = mem_cgroup_swappiness(parent); 5148 mem->swappiness = mem_cgroup_swappiness(parent);
5157 atomic_set(&mem->refcnt, 1); 5149 atomic_set(&mem->refcnt, 1);
5158 mem->move_charge_at_immigrate = 0; 5150 mem->move_charge_at_immigrate = 0;
5159 mutex_init(&mem->thresholds_lock); 5151 mutex_init(&mem->thresholds_lock);
5160 spin_lock_init(&mem->scanstat.lock); 5152 spin_lock_init(&mem->scanstat.lock);
5161 return &mem->css; 5153 return &mem->css;
5162 free_out: 5154 free_out:
5163 __mem_cgroup_free(mem); 5155 __mem_cgroup_free(mem);
5164 root_mem_cgroup = NULL; 5156 root_mem_cgroup = NULL;
5165 return ERR_PTR(error); 5157 return ERR_PTR(error);
5166 } 5158 }
5167 5159
5168 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 5160 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5169 struct cgroup *cont) 5161 struct cgroup *cont)
5170 { 5162 {
5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5163 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
5172 5164
5173 return mem_cgroup_force_empty(mem, false); 5165 return mem_cgroup_force_empty(mem, false);
5174 } 5166 }
5175 5167
5176 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 5168 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5177 struct cgroup *cont) 5169 struct cgroup *cont)
5178 { 5170 {
5179 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
5180 5172
5181 mem_cgroup_put(mem); 5173 mem_cgroup_put(mem);
5182 } 5174 }
5183 5175
5184 static int mem_cgroup_populate(struct cgroup_subsys *ss, 5176 static int mem_cgroup_populate(struct cgroup_subsys *ss,
5185 struct cgroup *cont) 5177 struct cgroup *cont)
5186 { 5178 {
5187 int ret; 5179 int ret;
5188 5180
5189 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 5181 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5190 ARRAY_SIZE(mem_cgroup_files)); 5182 ARRAY_SIZE(mem_cgroup_files));
5191 5183
5192 if (!ret) 5184 if (!ret)
5193 ret = register_memsw_files(cont, ss); 5185 ret = register_memsw_files(cont, ss);
5194 return ret; 5186 return ret;
5195 } 5187 }
5196 5188
5197 #ifdef CONFIG_MMU 5189 #ifdef CONFIG_MMU
5198 /* Handlers for move charge at task migration. */ 5190 /* Handlers for move charge at task migration. */
5199 #define PRECHARGE_COUNT_AT_ONCE 256 5191 #define PRECHARGE_COUNT_AT_ONCE 256
5200 static int mem_cgroup_do_precharge(unsigned long count) 5192 static int mem_cgroup_do_precharge(unsigned long count)
5201 { 5193 {
5202 int ret = 0; 5194 int ret = 0;
5203 int batch_count = PRECHARGE_COUNT_AT_ONCE; 5195 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5204 struct mem_cgroup *mem = mc.to; 5196 struct mem_cgroup *mem = mc.to;
5205 5197
5206 if (mem_cgroup_is_root(mem)) { 5198 if (mem_cgroup_is_root(mem)) {
5207 mc.precharge += count; 5199 mc.precharge += count;
5208 /* we don't need css_get for root */ 5200 /* we don't need css_get for root */
5209 return ret; 5201 return ret;
5210 } 5202 }
5211 /* try to charge at once */ 5203 /* try to charge at once */
5212 if (count > 1) { 5204 if (count > 1) {
5213 struct res_counter *dummy; 5205 struct res_counter *dummy;
5214 /* 5206 /*
5215 * "mem" cannot be under rmdir() because we've already checked 5207 * "mem" cannot be under rmdir() because we've already checked
5216 * by cgroup_lock_live_cgroup() that it is not removed and we 5208 * by cgroup_lock_live_cgroup() that it is not removed and we
5217 * are still under the same cgroup_mutex. So we can postpone 5209 * are still under the same cgroup_mutex. So we can postpone
5218 * css_get(). 5210 * css_get().
5219 */ 5211 */
5220 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5212 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
5221 goto one_by_one; 5213 goto one_by_one;
5222 if (do_swap_account && res_counter_charge(&mem->memsw, 5214 if (do_swap_account && res_counter_charge(&mem->memsw,
5223 PAGE_SIZE * count, &dummy)) { 5215 PAGE_SIZE * count, &dummy)) {
5224 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5216 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
5225 goto one_by_one; 5217 goto one_by_one;
5226 } 5218 }
5227 mc.precharge += count; 5219 mc.precharge += count;
5228 return ret; 5220 return ret;
5229 } 5221 }
5230 one_by_one: 5222 one_by_one:
5231 /* fall back to one by one charge */ 5223 /* fall back to one by one charge */
5232 while (count--) { 5224 while (count--) {
5233 if (signal_pending(current)) { 5225 if (signal_pending(current)) {
5234 ret = -EINTR; 5226 ret = -EINTR;
5235 break; 5227 break;
5236 } 5228 }
5237 if (!batch_count--) { 5229 if (!batch_count--) {
5238 batch_count = PRECHARGE_COUNT_AT_ONCE; 5230 batch_count = PRECHARGE_COUNT_AT_ONCE;
5239 cond_resched(); 5231 cond_resched();
5240 } 5232 }
5241 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5233 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
5242 if (ret || !mem) 5234 if (ret || !mem)
5243 /* mem_cgroup_clear_mc() will do uncharge later */ 5235 /* mem_cgroup_clear_mc() will do uncharge later */
5244 return -ENOMEM; 5236 return -ENOMEM;
5245 mc.precharge++; 5237 mc.precharge++;
5246 } 5238 }
5247 return ret; 5239 return ret;
5248 } 5240 }
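
The precharge logic first tries to reserve the whole batch against the res_counter in one call and, if the group is too close to its limit for that, falls back to charging page by page through __mem_cgroup_try_charge() so reclaim can make room, yielding the CPU every PRECHARGE_COUNT_AT_ONCE iterations. A user-space sketch of that "bulk first, then one-by-one with a periodic yield" shape against an invented budget counter; it is the pattern only, not the kernel interfaces.

/* Bulk-then-one-by-one reservation, cf. mem_cgroup_do_precharge().
 * "budget" and reserve() are invented stand-ins. */
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>

#define BATCH_YIELD 256                         /* cf. PRECHARGE_COUNT_AT_ONCE */

static long budget = 1000;                      /* stand-in for the res_counter */

static bool reserve(long n)
{
	if (budget < n)
		return false;
	budget -= n;
	return true;
}

static long precharge(long count)
{
	if (reserve(count))
		return count;                   /* cheap path: one bulk charge */

	long done = 0, batch = BATCH_YIELD;     /* fall back to one-by-one */
	while (count--) {
		if (!batch--) {
			batch = BATCH_YIELD;
			sched_yield();          /* cf. cond_resched() */
		}
		if (!reserve(1))
			break;                  /* the kernel returns -ENOMEM here */
		done++;
	}
	return done;
}

int main(void)
{
	printf("got %ld of 1500\n", precharge(1500));
	return 0;
}
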
5249 5241
5250 /** 5242 /**
5251 * is_target_pte_for_mc - check a pte whether it is valid for move charge 5243 * is_target_pte_for_mc - check a pte whether it is valid for move charge
5252 * @vma: the vma the pte to be checked belongs 5244 * @vma: the vma the pte to be checked belongs
5253 * @addr: the address corresponding to the pte to be checked 5245 * @addr: the address corresponding to the pte to be checked
5254 * @ptent: the pte to be checked 5246 * @ptent: the pte to be checked
5255 * @target: the pointer the target page or swap ent will be stored(can be NULL) 5247 * @target: the pointer the target page or swap ent will be stored(can be NULL)
5256 * 5248 *
5257 * Returns 5249 * Returns
5258 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 5250 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
5259 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 5251 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
5260 * move charge. if @target is not NULL, the page is stored in target->page 5252 * move charge. if @target is not NULL, the page is stored in target->page
5261 * with extra refcnt got(Callers should handle it). 5253 * with extra refcnt got(Callers should handle it).
5262 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 5254 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
5263 * target for charge migration. if @target is not NULL, the entry is stored 5255 * target for charge migration. if @target is not NULL, the entry is stored
5264 * in target->ent. 5256 * in target->ent.
5265 * 5257 *
5266 * Called with pte lock held. 5258 * Called with pte lock held.
5267 */ 5259 */
5268 union mc_target { 5260 union mc_target {
5269 struct page *page; 5261 struct page *page;
5270 swp_entry_t ent; 5262 swp_entry_t ent;
5271 }; 5263 };
5272 5264
5273 enum mc_target_type { 5265 enum mc_target_type {
5274 MC_TARGET_NONE, /* not used */ 5266 MC_TARGET_NONE, /* not used */
5275 MC_TARGET_PAGE, 5267 MC_TARGET_PAGE,
5276 MC_TARGET_SWAP, 5268 MC_TARGET_SWAP,
5277 }; 5269 };
5278 5270
5279 static struct page *mc_handle_present_pte(struct vm_area_struct *vma, 5271 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5280 unsigned long addr, pte_t ptent) 5272 unsigned long addr, pte_t ptent)
5281 { 5273 {
5282 struct page *page = vm_normal_page(vma, addr, ptent); 5274 struct page *page = vm_normal_page(vma, addr, ptent);
5283 5275
5284 if (!page || !page_mapped(page)) 5276 if (!page || !page_mapped(page))
5285 return NULL; 5277 return NULL;
5286 if (PageAnon(page)) { 5278 if (PageAnon(page)) {
5287 /* we don't move shared anon */ 5279 /* we don't move shared anon */
5288 if (!move_anon() || page_mapcount(page) > 2) 5280 if (!move_anon() || page_mapcount(page) > 2)
5289 return NULL; 5281 return NULL;
5290 } else if (!move_file()) 5282 } else if (!move_file())
5291 /* we ignore mapcount for file pages */ 5283 /* we ignore mapcount for file pages */
5292 return NULL; 5284 return NULL;
5293 if (!get_page_unless_zero(page)) 5285 if (!get_page_unless_zero(page))
5294 return NULL; 5286 return NULL;
5295 5287
5296 return page; 5288 return page;
5297 } 5289 }
5298 5290
5299 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5291 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5300 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5292 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5301 { 5293 {
5302 int usage_count; 5294 int usage_count;
5303 struct page *page = NULL; 5295 struct page *page = NULL;
5304 swp_entry_t ent = pte_to_swp_entry(ptent); 5296 swp_entry_t ent = pte_to_swp_entry(ptent);
5305 5297
5306 if (!move_anon() || non_swap_entry(ent)) 5298 if (!move_anon() || non_swap_entry(ent))
5307 return NULL; 5299 return NULL;
5308 usage_count = mem_cgroup_count_swap_user(ent, &page); 5300 usage_count = mem_cgroup_count_swap_user(ent, &page);
5309 if (usage_count > 1) { /* we don't move shared anon */ 5301 if (usage_count > 1) { /* we don't move shared anon */
5310 if (page) 5302 if (page)
5311 put_page(page); 5303 put_page(page);
5312 return NULL; 5304 return NULL;
5313 } 5305 }
5314 if (do_swap_account) 5306 if (do_swap_account)
5315 entry->val = ent.val; 5307 entry->val = ent.val;
5316 5308
5317 return page; 5309 return page;
5318 } 5310 }
5319 5311
5320 static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5312 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5321 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5313 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5322 { 5314 {
5323 struct page *page = NULL; 5315 struct page *page = NULL;
5324 struct inode *inode; 5316 struct inode *inode;
5325 struct address_space *mapping; 5317 struct address_space *mapping;
5326 pgoff_t pgoff; 5318 pgoff_t pgoff;
5327 5319
5328 if (!vma->vm_file) /* anonymous vma */ 5320 if (!vma->vm_file) /* anonymous vma */
5329 return NULL; 5321 return NULL;
5330 if (!move_file()) 5322 if (!move_file())
5331 return NULL; 5323 return NULL;
5332 5324
5333 inode = vma->vm_file->f_path.dentry->d_inode; 5325 inode = vma->vm_file->f_path.dentry->d_inode;
5334 mapping = vma->vm_file->f_mapping; 5326 mapping = vma->vm_file->f_mapping;
5335 if (pte_none(ptent)) 5327 if (pte_none(ptent))
5336 pgoff = linear_page_index(vma, addr); 5328 pgoff = linear_page_index(vma, addr);
5337 else /* pte_file(ptent) is true */ 5329 else /* pte_file(ptent) is true */
5338 pgoff = pte_to_pgoff(ptent); 5330 pgoff = pte_to_pgoff(ptent);
5339 5331
5340 /* page is moved even if it's not RSS of this task(page-faulted). */ 5332 /* page is moved even if it's not RSS of this task(page-faulted). */
5341 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5333 if (!mapping_cap_swap_backed(mapping)) { /* normal file */
5342 page = find_get_page(mapping, pgoff); 5334 page = find_get_page(mapping, pgoff);
5343 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5335 } else { /* shmem/tmpfs file. we should take account of swap too. */
5344 swp_entry_t ent; 5336 swp_entry_t ent;
5345 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5337 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
5346 if (do_swap_account) 5338 if (do_swap_account)
5347 entry->val = ent.val; 5339 entry->val = ent.val;
5348 } 5340 }
5349 5341
5350 return page; 5342 return page;
5351 } 5343 }
5352 5344
5353 static int is_target_pte_for_mc(struct vm_area_struct *vma, 5345 static int is_target_pte_for_mc(struct vm_area_struct *vma,
5354 unsigned long addr, pte_t ptent, union mc_target *target) 5346 unsigned long addr, pte_t ptent, union mc_target *target)
5355 { 5347 {
5356 struct page *page = NULL; 5348 struct page *page = NULL;
5357 struct page_cgroup *pc; 5349 struct page_cgroup *pc;
5358 int ret = 0; 5350 int ret = 0;
5359 swp_entry_t ent = { .val = 0 }; 5351 swp_entry_t ent = { .val = 0 };
5360 5352
5361 if (pte_present(ptent)) 5353 if (pte_present(ptent))
5362 page = mc_handle_present_pte(vma, addr, ptent); 5354 page = mc_handle_present_pte(vma, addr, ptent);
5363 else if (is_swap_pte(ptent)) 5355 else if (is_swap_pte(ptent))
5364 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 5356 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
5365 else if (pte_none(ptent) || pte_file(ptent)) 5357 else if (pte_none(ptent) || pte_file(ptent))
5366 page = mc_handle_file_pte(vma, addr, ptent, &ent); 5358 page = mc_handle_file_pte(vma, addr, ptent, &ent);
5367 5359
5368 if (!page && !ent.val) 5360 if (!page && !ent.val)
5369 return 0; 5361 return 0;
5370 if (page) { 5362 if (page) {
5371 pc = lookup_page_cgroup(page); 5363 pc = lookup_page_cgroup(page);
5372 /* 5364 /*
5373 * Do only loose check w/o page_cgroup lock. 5365 * Do only loose check w/o page_cgroup lock.
5374 * mem_cgroup_move_account() checks the pc is valid or not under 5366 * mem_cgroup_move_account() checks the pc is valid or not under
5375 * the lock. 5367 * the lock.
5376 */ 5368 */
5377 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { 5369 if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
5378 ret = MC_TARGET_PAGE; 5370 ret = MC_TARGET_PAGE;
5379 if (target) 5371 if (target)
5380 target->page = page; 5372 target->page = page;
5381 } 5373 }
5382 if (!ret || !target) 5374 if (!ret || !target)
5383 put_page(page); 5375 put_page(page);
5384 } 5376 }
5385 /* There is a swap entry and a page doesn't exist or isn't charged */ 5377 /* There is a swap entry and a page doesn't exist or isn't charged */
5386 if (ent.val && !ret && 5378 if (ent.val && !ret &&
5387 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { 5379 css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
5388 ret = MC_TARGET_SWAP; 5380 ret = MC_TARGET_SWAP;
5389 if (target) 5381 if (target)
5390 target->ent = ent; 5382 target->ent = ent;
5391 } 5383 }
5392 return ret; 5384 return ret;
5393 } 5385 }
5394 5386
5395 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, 5387 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5396 unsigned long addr, unsigned long end, 5388 unsigned long addr, unsigned long end,
5397 struct mm_walk *walk) 5389 struct mm_walk *walk)
5398 { 5390 {
5399 struct vm_area_struct *vma = walk->private; 5391 struct vm_area_struct *vma = walk->private;
5400 pte_t *pte; 5392 pte_t *pte;
5401 spinlock_t *ptl; 5393 spinlock_t *ptl;
5402 5394
5403 split_huge_page_pmd(walk->mm, pmd); 5395 split_huge_page_pmd(walk->mm, pmd);
5404 5396
5405 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5397 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5406 for (; addr != end; pte++, addr += PAGE_SIZE) 5398 for (; addr != end; pte++, addr += PAGE_SIZE)
5407 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5399 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
5408 mc.precharge++; /* increment precharge temporarily */ 5400 mc.precharge++; /* increment precharge temporarily */
5409 pte_unmap_unlock(pte - 1, ptl); 5401 pte_unmap_unlock(pte - 1, ptl);
5410 cond_resched(); 5402 cond_resched();
5411 5403
5412 return 0; 5404 return 0;
5413 } 5405 }
5414 5406
5415 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 5407 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5416 { 5408 {
5417 unsigned long precharge; 5409 unsigned long precharge;
5418 struct vm_area_struct *vma; 5410 struct vm_area_struct *vma;
5419 5411
5420 down_read(&mm->mmap_sem); 5412 down_read(&mm->mmap_sem);
5421 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5413 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5422 struct mm_walk mem_cgroup_count_precharge_walk = { 5414 struct mm_walk mem_cgroup_count_precharge_walk = {
5423 .pmd_entry = mem_cgroup_count_precharge_pte_range, 5415 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5424 .mm = mm, 5416 .mm = mm,
5425 .private = vma, 5417 .private = vma,
5426 }; 5418 };
5427 if (is_vm_hugetlb_page(vma)) 5419 if (is_vm_hugetlb_page(vma))
5428 continue; 5420 continue;
5429 walk_page_range(vma->vm_start, vma->vm_end, 5421 walk_page_range(vma->vm_start, vma->vm_end,
5430 &mem_cgroup_count_precharge_walk); 5422 &mem_cgroup_count_precharge_walk);
5431 } 5423 }
5432 up_read(&mm->mmap_sem); 5424 up_read(&mm->mmap_sem);
5433 5425
5434 precharge = mc.precharge; 5426 precharge = mc.precharge;
5435 mc.precharge = 0; 5427 mc.precharge = 0;
5436 5428
5437 return precharge; 5429 return precharge;
5438 } 5430 }
5439 5431
5440 static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5432 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5441 { 5433 {
5442 unsigned long precharge = mem_cgroup_count_precharge(mm); 5434 unsigned long precharge = mem_cgroup_count_precharge(mm);
5443 5435
5444 VM_BUG_ON(mc.moving_task); 5436 VM_BUG_ON(mc.moving_task);
5445 mc.moving_task = current; 5437 mc.moving_task = current;
5446 return mem_cgroup_do_precharge(precharge); 5438 return mem_cgroup_do_precharge(precharge);
5447 } 5439 }
5448 5440
5449 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ 5441 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5450 static void __mem_cgroup_clear_mc(void) 5442 static void __mem_cgroup_clear_mc(void)
5451 { 5443 {
5452 struct mem_cgroup *from = mc.from; 5444 struct mem_cgroup *from = mc.from;
5453 struct mem_cgroup *to = mc.to; 5445 struct mem_cgroup *to = mc.to;
5454 5446
5455 /* we must uncharge all the leftover precharges from mc.to */ 5447 /* we must uncharge all the leftover precharges from mc.to */
5456 if (mc.precharge) { 5448 if (mc.precharge) {
5457 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 5449 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
5458 mc.precharge = 0; 5450 mc.precharge = 0;
5459 } 5451 }
5460 /* 5452 /*
5461 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 5453 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5462 * we must uncharge here. 5454 * we must uncharge here.
5463 */ 5455 */
5464 if (mc.moved_charge) { 5456 if (mc.moved_charge) {
5465 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 5457 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
5466 mc.moved_charge = 0; 5458 mc.moved_charge = 0;
5467 } 5459 }
5468 /* we must fixup refcnts and charges */ 5460 /* we must fixup refcnts and charges */
5469 if (mc.moved_swap) { 5461 if (mc.moved_swap) {
5470 /* uncharge swap account from the old cgroup */ 5462 /* uncharge swap account from the old cgroup */
5471 if (!mem_cgroup_is_root(mc.from)) 5463 if (!mem_cgroup_is_root(mc.from))
5472 res_counter_uncharge(&mc.from->memsw, 5464 res_counter_uncharge(&mc.from->memsw,
5473 PAGE_SIZE * mc.moved_swap); 5465 PAGE_SIZE * mc.moved_swap);
5474 __mem_cgroup_put(mc.from, mc.moved_swap); 5466 __mem_cgroup_put(mc.from, mc.moved_swap);
5475 5467
5476 if (!mem_cgroup_is_root(mc.to)) { 5468 if (!mem_cgroup_is_root(mc.to)) {
5477 /* 5469 /*
5478 * we charged both to->res and to->memsw, so we should 5470 * we charged both to->res and to->memsw, so we should
5479 * uncharge to->res. 5471 * uncharge to->res.
5480 */ 5472 */
5481 res_counter_uncharge(&mc.to->res, 5473 res_counter_uncharge(&mc.to->res,
5482 PAGE_SIZE * mc.moved_swap); 5474 PAGE_SIZE * mc.moved_swap);
5483 } 5475 }
5484 /* we've already done mem_cgroup_get(mc.to) */ 5476 /* we've already done mem_cgroup_get(mc.to) */
5485 mc.moved_swap = 0; 5477 mc.moved_swap = 0;
5486 } 5478 }
5487 memcg_oom_recover(from); 5479 memcg_oom_recover(from);
5488 memcg_oom_recover(to); 5480 memcg_oom_recover(to);
5489 wake_up_all(&mc.waitq); 5481 wake_up_all(&mc.waitq);
5490 } 5482 }
5491 5483
5492 static void mem_cgroup_clear_mc(void) 5484 static void mem_cgroup_clear_mc(void)
5493 { 5485 {
5494 struct mem_cgroup *from = mc.from; 5486 struct mem_cgroup *from = mc.from;
5495 5487
5496 /* 5488 /*
5497 * we must clear moving_task before waking up waiters at the end of 5489 * we must clear moving_task before waking up waiters at the end of
5498 * task migration. 5490 * task migration.
5499 */ 5491 */
5500 mc.moving_task = NULL; 5492 mc.moving_task = NULL;
5501 __mem_cgroup_clear_mc(); 5493 __mem_cgroup_clear_mc();
5502 spin_lock(&mc.lock); 5494 spin_lock(&mc.lock);
5503 mc.from = NULL; 5495 mc.from = NULL;
5504 mc.to = NULL; 5496 mc.to = NULL;
5505 spin_unlock(&mc.lock); 5497 spin_unlock(&mc.lock);
5506 mem_cgroup_end_move(from); 5498 mem_cgroup_end_move(from);
5507 } 5499 }
5508 5500
5509 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5501 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5510 struct cgroup *cgroup, 5502 struct cgroup *cgroup,
5511 struct task_struct *p) 5503 struct task_struct *p)
5512 { 5504 {
5513 int ret = 0; 5505 int ret = 0;
5514 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5506 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
5515 5507
5516 if (mem->move_charge_at_immigrate) { 5508 if (mem->move_charge_at_immigrate) {
5517 struct mm_struct *mm; 5509 struct mm_struct *mm;
5518 struct mem_cgroup *from = mem_cgroup_from_task(p); 5510 struct mem_cgroup *from = mem_cgroup_from_task(p);
5519 5511
5520 VM_BUG_ON(from == mem); 5512 VM_BUG_ON(from == mem);
5521 5513
5522 mm = get_task_mm(p); 5514 mm = get_task_mm(p);
5523 if (!mm) 5515 if (!mm)
5524 return 0; 5516 return 0;
5525 /* We move charges only when we move a owner of the mm */ 5517 /* We move charges only when we move a owner of the mm */
5526 if (mm->owner == p) { 5518 if (mm->owner == p) {
5527 VM_BUG_ON(mc.from); 5519 VM_BUG_ON(mc.from);
5528 VM_BUG_ON(mc.to); 5520 VM_BUG_ON(mc.to);
5529 VM_BUG_ON(mc.precharge); 5521 VM_BUG_ON(mc.precharge);
5530 VM_BUG_ON(mc.moved_charge); 5522 VM_BUG_ON(mc.moved_charge);
5531 VM_BUG_ON(mc.moved_swap); 5523 VM_BUG_ON(mc.moved_swap);
5532 mem_cgroup_start_move(from); 5524 mem_cgroup_start_move(from);
5533 spin_lock(&mc.lock); 5525 spin_lock(&mc.lock);
5534 mc.from = from; 5526 mc.from = from;
5535 mc.to = mem; 5527 mc.to = mem;
5536 spin_unlock(&mc.lock); 5528 spin_unlock(&mc.lock);
5537 /* We set mc.moving_task later */ 5529 /* We set mc.moving_task later */
5538 5530
5539 ret = mem_cgroup_precharge_mc(mm); 5531 ret = mem_cgroup_precharge_mc(mm);
5540 if (ret) 5532 if (ret)
5541 mem_cgroup_clear_mc(); 5533 mem_cgroup_clear_mc();
5542 } 5534 }
5543 mmput(mm); 5535 mmput(mm);
5544 } 5536 }
5545 return ret; 5537 return ret;
5546 } 5538 }
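
Charge moving only engages when the destination group has memory.move_charge_at_immigrate set and the task being attached owns the mm. From user space the knob is a bitmask written to that file (bit 0 for anonymous pages, bit 1 for file pages, per the memcg documentation of this era, which is what move_anon()/move_file() test). A short sketch, with the cgroup path again assumed.

/* Enable moving of both anon (bit 0) and file (bit 1) charges when a
 * task is attached to the assumed cgroup v1 group "foo". */
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>

int main(void)
{
	const char *path =
		"/sys/fs/cgroup/memory/foo/memory.move_charge_at_immigrate";
	int fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "3\n", 2) != 2) {         /* 3 = anon + file charges */
		perror("write");
		return 1;
	}
	close(fd);
	return 0;
}
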
5547 5539
5548 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5540 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5549 struct cgroup *cgroup, 5541 struct cgroup *cgroup,
5550 struct task_struct *p) 5542 struct task_struct *p)
5551 { 5543 {
5552 mem_cgroup_clear_mc(); 5544 mem_cgroup_clear_mc();
5553 } 5545 }
5554 5546
5555 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, 5547 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5556 unsigned long addr, unsigned long end, 5548 unsigned long addr, unsigned long end,
5557 struct mm_walk *walk) 5549 struct mm_walk *walk)
5558 { 5550 {
5559 int ret = 0; 5551 int ret = 0;
5560 struct vm_area_struct *vma = walk->private; 5552 struct vm_area_struct *vma = walk->private;
5561 pte_t *pte; 5553 pte_t *pte;
5562 spinlock_t *ptl; 5554 spinlock_t *ptl;
5563 5555
5564 split_huge_page_pmd(walk->mm, pmd); 5556 split_huge_page_pmd(walk->mm, pmd);
5565 retry: 5557 retry:
5566 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5558 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5567 for (; addr != end; addr += PAGE_SIZE) { 5559 for (; addr != end; addr += PAGE_SIZE) {
5568 pte_t ptent = *(pte++); 5560 pte_t ptent = *(pte++);
5569 union mc_target target; 5561 union mc_target target;
5570 int type; 5562 int type;
5571 struct page *page; 5563 struct page *page;
5572 struct page_cgroup *pc; 5564 struct page_cgroup *pc;
5573 swp_entry_t ent; 5565 swp_entry_t ent;
5574 5566
5575 if (!mc.precharge) 5567 if (!mc.precharge)
5576 break; 5568 break;
5577 5569
5578 type = is_target_pte_for_mc(vma, addr, ptent, &target); 5570 type = is_target_pte_for_mc(vma, addr, ptent, &target);
5579 switch (type) { 5571 switch (type) {
5580 case MC_TARGET_PAGE: 5572 case MC_TARGET_PAGE:
5581 page = target.page; 5573 page = target.page;
5582 if (isolate_lru_page(page)) 5574 if (isolate_lru_page(page))
5583 goto put; 5575 goto put;
5584 pc = lookup_page_cgroup(page); 5576 pc = lookup_page_cgroup(page);
5585 if (!mem_cgroup_move_account(page, 1, pc, 5577 if (!mem_cgroup_move_account(page, 1, pc,
5586 mc.from, mc.to, false)) { 5578 mc.from, mc.to, false)) {
5587 mc.precharge--; 5579 mc.precharge--;
5588 /* we uncharge from mc.from later. */ 5580 /* we uncharge from mc.from later. */
5589 mc.moved_charge++; 5581 mc.moved_charge++;
5590 } 5582 }
5591 putback_lru_page(page); 5583 putback_lru_page(page);
5592 put: /* is_target_pte_for_mc() gets the page */ 5584 put: /* is_target_pte_for_mc() gets the page */
5593 put_page(page); 5585 put_page(page);
5594 break; 5586 break;
5595 case MC_TARGET_SWAP: 5587 case MC_TARGET_SWAP:
5596 ent = target.ent; 5588 ent = target.ent;
5597 if (!mem_cgroup_move_swap_account(ent, 5589 if (!mem_cgroup_move_swap_account(ent,
5598 mc.from, mc.to, false)) { 5590 mc.from, mc.to, false)) {
5599 mc.precharge--; 5591 mc.precharge--;
5600 /* we fixup refcnts and charges later. */ 5592 /* we fixup refcnts and charges later. */
5601 mc.moved_swap++; 5593 mc.moved_swap++;
5602 } 5594 }
5603 break; 5595 break;
5604 default: 5596 default:
5605 break; 5597 break;
5606 } 5598 }
5607 } 5599 }
5608 pte_unmap_unlock(pte - 1, ptl); 5600 pte_unmap_unlock(pte - 1, ptl);
5609 cond_resched(); 5601 cond_resched();
5610 5602
5611 if (addr != end) { 5603 if (addr != end) {
5612 /* 5604 /*
5613 * We have consumed all precharges we got in can_attach(). 5605 * We have consumed all precharges we got in can_attach().
5614 * We try charge one by one, but don't do any additional 5606 * We try charge one by one, but don't do any additional
5615 * charges to mc.to if we have failed in charge once in attach() 5607 * charges to mc.to if we have failed in charge once in attach()
5616 * phase. 5608 * phase.
5617 */ 5609 */
5618 ret = mem_cgroup_do_precharge(1); 5610 ret = mem_cgroup_do_precharge(1);
5619 if (!ret) 5611 if (!ret)
5620 goto retry; 5612 goto retry;
5621 } 5613 }
5622 5614
5623 return ret; 5615 return ret;
5624 } 5616 }
5625 5617
5626 static void mem_cgroup_move_charge(struct mm_struct *mm) 5618 static void mem_cgroup_move_charge(struct mm_struct *mm)
5627 { 5619 {
5628 struct vm_area_struct *vma; 5620 struct vm_area_struct *vma;
5629 5621
5630 lru_add_drain_all(); 5622 lru_add_drain_all();
5631 retry: 5623 retry:
5632 if (unlikely(!down_read_trylock(&mm->mmap_sem))) { 5624 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5633 /* 5625 /*
5634 * Someone who are holding the mmap_sem might be waiting in 5626 * Someone who are holding the mmap_sem might be waiting in
5635 * waitq. So we cancel all extra charges, wake up all waiters, 5627 * waitq. So we cancel all extra charges, wake up all waiters,
5636 * and retry. Because we cancel precharges, we might not be able 5628 * and retry. Because we cancel precharges, we might not be able
5637 * to move enough charges, but moving charge is a best-effort 5629 * to move enough charges, but moving charge is a best-effort
5638 * feature anyway, so it wouldn't be a big problem. 5630 * feature anyway, so it wouldn't be a big problem.
5639 */ 5631 */
5640 __mem_cgroup_clear_mc(); 5632 __mem_cgroup_clear_mc();
5641 cond_resched(); 5633 cond_resched();
5642 goto retry; 5634 goto retry;
5643 } 5635 }
5644 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5636 for (vma = mm->mmap; vma; vma = vma->vm_next) {
5645 int ret; 5637 int ret;
5646 struct mm_walk mem_cgroup_move_charge_walk = { 5638 struct mm_walk mem_cgroup_move_charge_walk = {
5647 .pmd_entry = mem_cgroup_move_charge_pte_range, 5639 .pmd_entry = mem_cgroup_move_charge_pte_range,
5648 .mm = mm, 5640 .mm = mm,
5649 .private = vma, 5641 .private = vma,
5650 }; 5642 };
5651 if (is_vm_hugetlb_page(vma)) 5643 if (is_vm_hugetlb_page(vma))
5652 continue; 5644 continue;
5653 ret = walk_page_range(vma->vm_start, vma->vm_end, 5645 ret = walk_page_range(vma->vm_start, vma->vm_end,
5654 &mem_cgroup_move_charge_walk); 5646 &mem_cgroup_move_charge_walk);
5655 if (ret) 5647 if (ret)
5656 /* 5648 /*
5657 * means we have consumed all precharges and failed in 5649 * means we have consumed all precharges and failed in
5658 * doing additional charge. Just abandon here. 5650 * doing additional charge. Just abandon here.
5659 */ 5651 */
5660 break; 5652 break;
5661 } 5653 }
5662 up_read(&mm->mmap_sem); 5654 up_read(&mm->mmap_sem);
5663 } 5655 }
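
mem_cgroup_move_charge() deliberately uses down_read_trylock(): the task currently holding mmap_sem may itself be sleeping in mc.waitq, so on contention the mover cancels its extra charges (which wakes all waiters), reschedules, and retries rather than blocking and deadlocking. A user-space sketch of that trylock/undo/retry shape with a pthread rwlock; the helper names are invented.

/* "Trylock, on failure undo and wake waiters, then retry", cf. the
 * retry loop in mem_cgroup_move_charge().  Helpers are invented. */
#include <pthread.h>
#include <sched.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void cancel_and_wake_waiters(void)
{
	/* stand-in for __mem_cgroup_clear_mc(): give back reservations a
	 * blocked lock holder might be waiting on, and wake it */
}

static void do_move(void)
{
	/* stand-in for the page-table walk that moves the charges */
}

static void move_charges(void)
{
retry:
	if (pthread_rwlock_tryrdlock(&map_lock) != 0) {
		cancel_and_wake_waiters();
		sched_yield();                  /* cf. cond_resched() */
		goto retry;
	}
	do_move();
	pthread_rwlock_unlock(&map_lock);
}

int main(void)
{
	move_charges();
	return 0;
}
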
5664 5656
5665 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5657 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5666 struct cgroup *cont, 5658 struct cgroup *cont,
5667 struct cgroup *old_cont, 5659 struct cgroup *old_cont,
5668 struct task_struct *p) 5660 struct task_struct *p)
5669 { 5661 {
5670 struct mm_struct *mm = get_task_mm(p); 5662 struct mm_struct *mm = get_task_mm(p);
5671 5663
5672 if (mm) { 5664 if (mm) {
5673 if (mc.to) 5665 if (mc.to)
5674 mem_cgroup_move_charge(mm); 5666 mem_cgroup_move_charge(mm);
5675 put_swap_token(mm); 5667 put_swap_token(mm);
5676 mmput(mm); 5668 mmput(mm);
5677 } 5669 }
5678 if (mc.to) 5670 if (mc.to)
5679 mem_cgroup_clear_mc(); 5671 mem_cgroup_clear_mc();
5680 } 5672 }
5681 #else /* !CONFIG_MMU */ 5673 #else /* !CONFIG_MMU */
5682 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5674 static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5683 struct cgroup *cgroup, 5675 struct cgroup *cgroup,
5684 struct task_struct *p) 5676 struct task_struct *p)
5685 { 5677 {
5686 return 0; 5678 return 0;
5687 } 5679 }
5688 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5680 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
5689 struct cgroup *cgroup, 5681 struct cgroup *cgroup,
5690 struct task_struct *p) 5682 struct task_struct *p)
5691 { 5683 {
5692 } 5684 }
5693 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5685 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
5694 struct cgroup *cont, 5686 struct cgroup *cont,
5695 struct cgroup *old_cont, 5687 struct cgroup *old_cont,
5696 struct task_struct *p) 5688 struct task_struct *p)
5697 { 5689 {
5698 } 5690 }
5699 #endif 5691 #endif
5700 5692
5701 struct cgroup_subsys mem_cgroup_subsys = { 5693 struct cgroup_subsys mem_cgroup_subsys = {
5702 .name = "memory", 5694 .name = "memory",
5703 .subsys_id = mem_cgroup_subsys_id, 5695 .subsys_id = mem_cgroup_subsys_id,
5704 .create = mem_cgroup_create, 5696 .create = mem_cgroup_create,
5705 .pre_destroy = mem_cgroup_pre_destroy, 5697 .pre_destroy = mem_cgroup_pre_destroy,
5706 .destroy = mem_cgroup_destroy, 5698 .destroy = mem_cgroup_destroy,
5707 .populate = mem_cgroup_populate, 5699 .populate = mem_cgroup_populate,
5708 .can_attach = mem_cgroup_can_attach, 5700 .can_attach = mem_cgroup_can_attach,
5709 .cancel_attach = mem_cgroup_cancel_attach, 5701 .cancel_attach = mem_cgroup_cancel_attach,
5710 .attach = mem_cgroup_move_task, 5702 .attach = mem_cgroup_move_task,
5711 .early_init = 0, 5703 .early_init = 0,
5712 .use_id = 1, 5704 .use_id = 1,
5713 }; 5705 };
5714 5706
5715 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5707 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
5716 static int __init enable_swap_account(char *s) 5708 static int __init enable_swap_account(char *s)
5717 { 5709 {
5718 /* consider enabled if no parameter or 1 is given */ 5710 /* consider enabled if no parameter or 1 is given */
5719 if (!strcmp(s, "1")) 5711 if (!strcmp(s, "1"))
5720 really_do_swap_account = 1; 5712 really_do_swap_account = 1;
5721 else if (!strcmp(s, "0")) 5713 else if (!strcmp(s, "0"))
5722 really_do_swap_account = 0; 5714 really_do_swap_account = 0;
5723 return 1; 5715 return 1;
5724 } 5716 }
5725 __setup("swapaccount=", enable_swap_account); 5717 __setup("swapaccount=", enable_swap_account);
5726 5718
5727 #endif 5719 #endif