Commit 39cc98f1f8aa949afeea89f424c7494b0785d7da

Authored by Michal Hocko
Committed by Linus Torvalds
1 parent d149e3b25d

memcg: remove pointless next_mz nullification in mem_cgroup_soft_limit_reclaim()

next_mz is set to NULL when __mem_cgroup_largest_soft_limit_node selects
the same mz.  This doesn't make much sense, because the variable is
reassigned at the top of the next loop iteration anyway.

The compiler will probably optimize the dead store away, but it makes the
code slightly confusing to read.
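
For orientation, a minimal sketch of the pattern being removed (paraphrased
from the description above; this is not the verbatim hunk, and the exact
shape of the surrounding loop in mem_cgroup_soft_limit_reclaim() is assumed):

	do {
		/* pick the next-largest soft-limit offender; may be NULL */
		next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
		if (next_mz == mz) {
			css_put(&next_mz->mem->css);
			next_mz = NULL;	/* dead store removed by this patch:
					 * next_mz is overwritten at the top of
					 * the next pass through this loop */
		} else	/* next_mz == NULL or another memcg */
			break;
	} while (1);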

Signed-off-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 2 additions and 3 deletions

1 /* memcontrol.c - Memory Controller 1 /* memcontrol.c - Memory Controller
2 * 2 *
3 * Copyright IBM Corporation, 2007 3 * Copyright IBM Corporation, 2007
4 * Author Balbir Singh <balbir@linux.vnet.ibm.com> 4 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
5 * 5 *
6 * Copyright 2007 OpenVZ SWsoft Inc 6 * Copyright 2007 OpenVZ SWsoft Inc
7 * Author: Pavel Emelianov <xemul@openvz.org> 7 * Author: Pavel Emelianov <xemul@openvz.org>
8 * 8 *
9 * Memory thresholds 9 * Memory thresholds
10 * Copyright (C) 2009 Nokia Corporation 10 * Copyright (C) 2009 Nokia Corporation
11 * Author: Kirill A. Shutemov 11 * Author: Kirill A. Shutemov
12 * 12 *
13 * This program is free software; you can redistribute it and/or modify 13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by 14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or 15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version. 16 * (at your option) any later version.
17 * 17 *
18 * This program is distributed in the hope that it will be useful, 18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of 19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details. 21 * GNU General Public License for more details.
22 */ 22 */
23 23
24 #include <linux/res_counter.h> 24 #include <linux/res_counter.h>
25 #include <linux/memcontrol.h> 25 #include <linux/memcontrol.h>
26 #include <linux/cgroup.h> 26 #include <linux/cgroup.h>
27 #include <linux/mm.h> 27 #include <linux/mm.h>
28 #include <linux/hugetlb.h> 28 #include <linux/hugetlb.h>
29 #include <linux/pagemap.h> 29 #include <linux/pagemap.h>
30 #include <linux/smp.h> 30 #include <linux/smp.h>
31 #include <linux/page-flags.h> 31 #include <linux/page-flags.h>
32 #include <linux/backing-dev.h> 32 #include <linux/backing-dev.h>
33 #include <linux/bit_spinlock.h> 33 #include <linux/bit_spinlock.h>
34 #include <linux/rcupdate.h> 34 #include <linux/rcupdate.h>
35 #include <linux/limits.h> 35 #include <linux/limits.h>
36 #include <linux/mutex.h> 36 #include <linux/mutex.h>
37 #include <linux/rbtree.h> 37 #include <linux/rbtree.h>
38 #include <linux/slab.h> 38 #include <linux/slab.h>
39 #include <linux/swap.h> 39 #include <linux/swap.h>
40 #include <linux/swapops.h> 40 #include <linux/swapops.h>
41 #include <linux/spinlock.h> 41 #include <linux/spinlock.h>
42 #include <linux/eventfd.h> 42 #include <linux/eventfd.h>
43 #include <linux/sort.h> 43 #include <linux/sort.h>
44 #include <linux/fs.h> 44 #include <linux/fs.h>
45 #include <linux/seq_file.h> 45 #include <linux/seq_file.h>
46 #include <linux/vmalloc.h> 46 #include <linux/vmalloc.h>
47 #include <linux/mm_inline.h> 47 #include <linux/mm_inline.h>
48 #include <linux/page_cgroup.h> 48 #include <linux/page_cgroup.h>
49 #include <linux/cpu.h> 49 #include <linux/cpu.h>
50 #include <linux/oom.h> 50 #include <linux/oom.h>
51 #include "internal.h" 51 #include "internal.h"
52 52
53 #include <asm/uaccess.h> 53 #include <asm/uaccess.h>
54 54
55 #include <trace/events/vmscan.h> 55 #include <trace/events/vmscan.h>
56 56
57 struct cgroup_subsys mem_cgroup_subsys __read_mostly; 57 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
58 #define MEM_CGROUP_RECLAIM_RETRIES 5 58 #define MEM_CGROUP_RECLAIM_RETRIES 5
59 struct mem_cgroup *root_mem_cgroup __read_mostly; 59 struct mem_cgroup *root_mem_cgroup __read_mostly;
60 60
61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 61 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 62 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63 int do_swap_account __read_mostly; 63 int do_swap_account __read_mostly;
64 64
65 /* for remember boot option*/ 65 /* for remember boot option*/
66 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 66 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
67 static int really_do_swap_account __initdata = 1; 67 static int really_do_swap_account __initdata = 1;
68 #else 68 #else
69 static int really_do_swap_account __initdata = 0; 69 static int really_do_swap_account __initdata = 0;
70 #endif 70 #endif
71 71
72 #else 72 #else
73 #define do_swap_account (0) 73 #define do_swap_account (0)
74 #endif 74 #endif
75 75
76 76
77 /* 77 /*
78 * Statistics for memory cgroup. 78 * Statistics for memory cgroup.
79 */ 79 */
80 enum mem_cgroup_stat_index { 80 enum mem_cgroup_stat_index {
81 /* 81 /*
82 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 82 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
83 */ 83 */
84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 84 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 85 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 86 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 87 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ 88 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ 89 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
90 MEM_CGROUP_STAT_NSTATS, 90 MEM_CGROUP_STAT_NSTATS,
91 }; 91 };
92 92
93 enum mem_cgroup_events_index { 93 enum mem_cgroup_events_index {
94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 94 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 95 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ 96 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
97 MEM_CGROUP_EVENTS_NSTATS, 97 MEM_CGROUP_EVENTS_NSTATS,
98 }; 98 };
99 /* 99 /*
100 * Per memcg event counter is incremented at every pagein/pageout. With THP, 100 * Per memcg event counter is incremented at every pagein/pageout. With THP,
101 * it will be incremated by the number of pages. This counter is used for 101 * it will be incremated by the number of pages. This counter is used for
102 * for trigger some periodic events. This is straightforward and better 102 * for trigger some periodic events. This is straightforward and better
103 * than using jiffies etc. to handle periodic memcg event. 103 * than using jiffies etc. to handle periodic memcg event.
104 */ 104 */
105 enum mem_cgroup_events_target { 105 enum mem_cgroup_events_target {
106 MEM_CGROUP_TARGET_THRESH, 106 MEM_CGROUP_TARGET_THRESH,
107 MEM_CGROUP_TARGET_SOFTLIMIT, 107 MEM_CGROUP_TARGET_SOFTLIMIT,
108 MEM_CGROUP_NTARGETS, 108 MEM_CGROUP_NTARGETS,
109 }; 109 };
110 #define THRESHOLDS_EVENTS_TARGET (128) 110 #define THRESHOLDS_EVENTS_TARGET (128)
111 #define SOFTLIMIT_EVENTS_TARGET (1024) 111 #define SOFTLIMIT_EVENTS_TARGET (1024)
112 112
113 struct mem_cgroup_stat_cpu { 113 struct mem_cgroup_stat_cpu {
114 long count[MEM_CGROUP_STAT_NSTATS]; 114 long count[MEM_CGROUP_STAT_NSTATS];
115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 115 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
116 unsigned long targets[MEM_CGROUP_NTARGETS]; 116 unsigned long targets[MEM_CGROUP_NTARGETS];
117 }; 117 };
118 118
119 /* 119 /*
120 * per-zone information in memory controller. 120 * per-zone information in memory controller.
121 */ 121 */
122 struct mem_cgroup_per_zone { 122 struct mem_cgroup_per_zone {
123 /* 123 /*
124 * spin_lock to protect the per cgroup LRU 124 * spin_lock to protect the per cgroup LRU
125 */ 125 */
126 struct list_head lists[NR_LRU_LISTS]; 126 struct list_head lists[NR_LRU_LISTS];
127 unsigned long count[NR_LRU_LISTS]; 127 unsigned long count[NR_LRU_LISTS];
128 128
129 struct zone_reclaim_stat reclaim_stat; 129 struct zone_reclaim_stat reclaim_stat;
130 struct rb_node tree_node; /* RB tree node */ 130 struct rb_node tree_node; /* RB tree node */
131 unsigned long long usage_in_excess;/* Set to the value by which */ 131 unsigned long long usage_in_excess;/* Set to the value by which */
132 /* the soft limit is exceeded*/ 132 /* the soft limit is exceeded*/
133 bool on_tree; 133 bool on_tree;
134 struct mem_cgroup *mem; /* Back pointer, we cannot */ 134 struct mem_cgroup *mem; /* Back pointer, we cannot */
135 /* use container_of */ 135 /* use container_of */
136 }; 136 };
137 /* Macro for accessing counter */ 137 /* Macro for accessing counter */
138 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 138 #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
139 139
140 struct mem_cgroup_per_node { 140 struct mem_cgroup_per_node {
141 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; 141 struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
142 }; 142 };
143 143
144 struct mem_cgroup_lru_info { 144 struct mem_cgroup_lru_info {
145 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; 145 struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
146 }; 146 };
147 147
148 /* 148 /*
149 * Cgroups above their limits are maintained in a RB-Tree, independent of 149 * Cgroups above their limits are maintained in a RB-Tree, independent of
150 * their hierarchy representation 150 * their hierarchy representation
151 */ 151 */
152 152
153 struct mem_cgroup_tree_per_zone { 153 struct mem_cgroup_tree_per_zone {
154 struct rb_root rb_root; 154 struct rb_root rb_root;
155 spinlock_t lock; 155 spinlock_t lock;
156 }; 156 };
157 157
158 struct mem_cgroup_tree_per_node { 158 struct mem_cgroup_tree_per_node {
159 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; 159 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
160 }; 160 };
161 161
162 struct mem_cgroup_tree { 162 struct mem_cgroup_tree {
163 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; 163 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
164 }; 164 };
165 165
166 static struct mem_cgroup_tree soft_limit_tree __read_mostly; 166 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
167 167
168 struct mem_cgroup_threshold { 168 struct mem_cgroup_threshold {
169 struct eventfd_ctx *eventfd; 169 struct eventfd_ctx *eventfd;
170 u64 threshold; 170 u64 threshold;
171 }; 171 };
172 172
173 /* For threshold */ 173 /* For threshold */
174 struct mem_cgroup_threshold_ary { 174 struct mem_cgroup_threshold_ary {
175 /* An array index points to threshold just below usage. */ 175 /* An array index points to threshold just below usage. */
176 int current_threshold; 176 int current_threshold;
177 /* Size of entries[] */ 177 /* Size of entries[] */
178 unsigned int size; 178 unsigned int size;
179 /* Array of thresholds */ 179 /* Array of thresholds */
180 struct mem_cgroup_threshold entries[0]; 180 struct mem_cgroup_threshold entries[0];
181 }; 181 };
182 182
183 struct mem_cgroup_thresholds { 183 struct mem_cgroup_thresholds {
184 /* Primary thresholds array */ 184 /* Primary thresholds array */
185 struct mem_cgroup_threshold_ary *primary; 185 struct mem_cgroup_threshold_ary *primary;
186 /* 186 /*
187 * Spare threshold array. 187 * Spare threshold array.
188 * This is needed to make mem_cgroup_unregister_event() "never fail". 188 * This is needed to make mem_cgroup_unregister_event() "never fail".
189 * It must be able to store at least primary->size - 1 entries. 189 * It must be able to store at least primary->size - 1 entries.
190 */ 190 */
191 struct mem_cgroup_threshold_ary *spare; 191 struct mem_cgroup_threshold_ary *spare;
192 }; 192 };
193 193
194 /* for OOM */ 194 /* for OOM */
195 struct mem_cgroup_eventfd_list { 195 struct mem_cgroup_eventfd_list {
196 struct list_head list; 196 struct list_head list;
197 struct eventfd_ctx *eventfd; 197 struct eventfd_ctx *eventfd;
198 }; 198 };
199 199
200 static void mem_cgroup_threshold(struct mem_cgroup *mem); 200 static void mem_cgroup_threshold(struct mem_cgroup *mem);
201 static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 201 static void mem_cgroup_oom_notify(struct mem_cgroup *mem);
202 202
203 /* 203 /*
204 * The memory controller data structure. The memory controller controls both 204 * The memory controller data structure. The memory controller controls both
205 * page cache and RSS per cgroup. We would eventually like to provide 205 * page cache and RSS per cgroup. We would eventually like to provide
206 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 206 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
207 * to help the administrator determine what knobs to tune. 207 * to help the administrator determine what knobs to tune.
208 * 208 *
209 * TODO: Add a water mark for the memory controller. Reclaim will begin when 209 * TODO: Add a water mark for the memory controller. Reclaim will begin when
210 * we hit the water mark. May be even add a low water mark, such that 210 * we hit the water mark. May be even add a low water mark, such that
211 * no reclaim occurs from a cgroup at it's low water mark, this is 211 * no reclaim occurs from a cgroup at it's low water mark, this is
212 * a feature that will be implemented much later in the future. 212 * a feature that will be implemented much later in the future.
213 */ 213 */
214 struct mem_cgroup { 214 struct mem_cgroup {
215 struct cgroup_subsys_state css; 215 struct cgroup_subsys_state css;
216 /* 216 /*
217 * the counter to account for memory usage 217 * the counter to account for memory usage
218 */ 218 */
219 struct res_counter res; 219 struct res_counter res;
220 /* 220 /*
221 * the counter to account for mem+swap usage. 221 * the counter to account for mem+swap usage.
222 */ 222 */
223 struct res_counter memsw; 223 struct res_counter memsw;
224 /* 224 /*
225 * Per cgroup active and inactive list, similar to the 225 * Per cgroup active and inactive list, similar to the
226 * per zone LRU lists. 226 * per zone LRU lists.
227 */ 227 */
228 struct mem_cgroup_lru_info info; 228 struct mem_cgroup_lru_info info;
229 /* 229 /*
230 * While reclaiming in a hierarchy, we cache the last child we 230 * While reclaiming in a hierarchy, we cache the last child we
231 * reclaimed from. 231 * reclaimed from.
232 */ 232 */
233 int last_scanned_child; 233 int last_scanned_child;
234 /* 234 /*
235 * Should the accounting and control be hierarchical, per subtree? 235 * Should the accounting and control be hierarchical, per subtree?
236 */ 236 */
237 bool use_hierarchy; 237 bool use_hierarchy;
238 atomic_t oom_lock; 238 atomic_t oom_lock;
239 atomic_t refcnt; 239 atomic_t refcnt;
240 240
241 unsigned int swappiness; 241 unsigned int swappiness;
242 /* OOM-Killer disable */ 242 /* OOM-Killer disable */
243 int oom_kill_disable; 243 int oom_kill_disable;
244 244
245 /* set when res.limit == memsw.limit */ 245 /* set when res.limit == memsw.limit */
246 bool memsw_is_minimum; 246 bool memsw_is_minimum;
247 247
248 /* protect arrays of thresholds */ 248 /* protect arrays of thresholds */
249 struct mutex thresholds_lock; 249 struct mutex thresholds_lock;
250 250
251 /* thresholds for memory usage. RCU-protected */ 251 /* thresholds for memory usage. RCU-protected */
252 struct mem_cgroup_thresholds thresholds; 252 struct mem_cgroup_thresholds thresholds;
253 253
254 /* thresholds for mem+swap usage. RCU-protected */ 254 /* thresholds for mem+swap usage. RCU-protected */
255 struct mem_cgroup_thresholds memsw_thresholds; 255 struct mem_cgroup_thresholds memsw_thresholds;
256 256
257 /* For oom notifier event fd */ 257 /* For oom notifier event fd */
258 struct list_head oom_notify; 258 struct list_head oom_notify;
259 259
260 /* 260 /*
261 * Should we move charges of a task when a task is moved into this 261 * Should we move charges of a task when a task is moved into this
262 * mem_cgroup ? And what type of charges should we move ? 262 * mem_cgroup ? And what type of charges should we move ?
263 */ 263 */
264 unsigned long move_charge_at_immigrate; 264 unsigned long move_charge_at_immigrate;
265 /* 265 /*
266 * percpu counter. 266 * percpu counter.
267 */ 267 */
268 struct mem_cgroup_stat_cpu *stat; 268 struct mem_cgroup_stat_cpu *stat;
269 /* 269 /*
270 * used when a cpu is offlined or other synchronizations 270 * used when a cpu is offlined or other synchronizations
271 * See mem_cgroup_read_stat(). 271 * See mem_cgroup_read_stat().
272 */ 272 */
273 struct mem_cgroup_stat_cpu nocpu_base; 273 struct mem_cgroup_stat_cpu nocpu_base;
274 spinlock_t pcp_counter_lock; 274 spinlock_t pcp_counter_lock;
275 }; 275 };
276 276
277 /* Stuffs for move charges at task migration. */ 277 /* Stuffs for move charges at task migration. */
278 /* 278 /*
279 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a 279 * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a
280 * left-shifted bitmap of these types. 280 * left-shifted bitmap of these types.
281 */ 281 */
282 enum move_type { 282 enum move_type {
283 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 283 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
284 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 284 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
285 NR_MOVE_TYPE, 285 NR_MOVE_TYPE,
286 }; 286 };
287 287
288 /* "mc" and its members are protected by cgroup_mutex */ 288 /* "mc" and its members are protected by cgroup_mutex */
289 static struct move_charge_struct { 289 static struct move_charge_struct {
290 spinlock_t lock; /* for from, to */ 290 spinlock_t lock; /* for from, to */
291 struct mem_cgroup *from; 291 struct mem_cgroup *from;
292 struct mem_cgroup *to; 292 struct mem_cgroup *to;
293 unsigned long precharge; 293 unsigned long precharge;
294 unsigned long moved_charge; 294 unsigned long moved_charge;
295 unsigned long moved_swap; 295 unsigned long moved_swap;
296 struct task_struct *moving_task; /* a task moving charges */ 296 struct task_struct *moving_task; /* a task moving charges */
297 wait_queue_head_t waitq; /* a waitq for other context */ 297 wait_queue_head_t waitq; /* a waitq for other context */
298 } mc = { 298 } mc = {
299 .lock = __SPIN_LOCK_UNLOCKED(mc.lock), 299 .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
300 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 300 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
301 }; 301 };
302 302
303 static bool move_anon(void) 303 static bool move_anon(void)
304 { 304 {
305 return test_bit(MOVE_CHARGE_TYPE_ANON, 305 return test_bit(MOVE_CHARGE_TYPE_ANON,
306 &mc.to->move_charge_at_immigrate); 306 &mc.to->move_charge_at_immigrate);
307 } 307 }
308 308
309 static bool move_file(void) 309 static bool move_file(void)
310 { 310 {
311 return test_bit(MOVE_CHARGE_TYPE_FILE, 311 return test_bit(MOVE_CHARGE_TYPE_FILE,
312 &mc.to->move_charge_at_immigrate); 312 &mc.to->move_charge_at_immigrate);
313 } 313 }
314 314
315 /* 315 /*
316 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 316 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
317 * limit reclaim to prevent infinite loops, if they ever occur. 317 * limit reclaim to prevent infinite loops, if they ever occur.
318 */ 318 */
319 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 319 #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
320 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 320 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
321 321
322 enum charge_type { 322 enum charge_type {
323 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 323 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
324 MEM_CGROUP_CHARGE_TYPE_MAPPED, 324 MEM_CGROUP_CHARGE_TYPE_MAPPED,
325 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 325 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
326 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 326 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
327 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 327 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
328 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 328 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
329 NR_CHARGE_TYPE, 329 NR_CHARGE_TYPE,
330 }; 330 };
331 331
332 /* for encoding cft->private value on file */ 332 /* for encoding cft->private value on file */
333 #define _MEM (0) 333 #define _MEM (0)
334 #define _MEMSWAP (1) 334 #define _MEMSWAP (1)
335 #define _OOM_TYPE (2) 335 #define _OOM_TYPE (2)
336 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 336 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
337 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 337 #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
338 #define MEMFILE_ATTR(val) ((val) & 0xffff) 338 #define MEMFILE_ATTR(val) ((val) & 0xffff)
339 /* Used for OOM nofiier */ 339 /* Used for OOM nofiier */
340 #define OOM_CONTROL (0) 340 #define OOM_CONTROL (0)
341 341
342 /* 342 /*
343 * Reclaim flags for mem_cgroup_hierarchical_reclaim 343 * Reclaim flags for mem_cgroup_hierarchical_reclaim
344 */ 344 */
345 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 345 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
346 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 346 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
347 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 347 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
348 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 348 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
349 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 349 #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
350 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 350 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
351 351
352 static void mem_cgroup_get(struct mem_cgroup *mem); 352 static void mem_cgroup_get(struct mem_cgroup *mem);
353 static void mem_cgroup_put(struct mem_cgroup *mem); 353 static void mem_cgroup_put(struct mem_cgroup *mem);
354 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 354 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
355 static void drain_all_stock_async(void); 355 static void drain_all_stock_async(void);
356 356
357 static struct mem_cgroup_per_zone * 357 static struct mem_cgroup_per_zone *
358 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 358 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
359 { 359 {
360 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 360 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
361 } 361 }
362 362
363 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 363 struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
364 { 364 {
365 return &mem->css; 365 return &mem->css;
366 } 366 }
367 367
368 static struct mem_cgroup_per_zone * 368 static struct mem_cgroup_per_zone *
369 page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 369 page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
370 { 370 {
371 int nid = page_to_nid(page); 371 int nid = page_to_nid(page);
372 int zid = page_zonenum(page); 372 int zid = page_zonenum(page);
373 373
374 return mem_cgroup_zoneinfo(mem, nid, zid); 374 return mem_cgroup_zoneinfo(mem, nid, zid);
375 } 375 }
376 376
377 static struct mem_cgroup_tree_per_zone * 377 static struct mem_cgroup_tree_per_zone *
378 soft_limit_tree_node_zone(int nid, int zid) 378 soft_limit_tree_node_zone(int nid, int zid)
379 { 379 {
380 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 380 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
381 } 381 }
382 382
383 static struct mem_cgroup_tree_per_zone * 383 static struct mem_cgroup_tree_per_zone *
384 soft_limit_tree_from_page(struct page *page) 384 soft_limit_tree_from_page(struct page *page)
385 { 385 {
386 int nid = page_to_nid(page); 386 int nid = page_to_nid(page);
387 int zid = page_zonenum(page); 387 int zid = page_zonenum(page);
388 388
389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; 389 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
390 } 390 }
391 391
392 static void 392 static void
393 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 393 __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
394 struct mem_cgroup_per_zone *mz, 394 struct mem_cgroup_per_zone *mz,
395 struct mem_cgroup_tree_per_zone *mctz, 395 struct mem_cgroup_tree_per_zone *mctz,
396 unsigned long long new_usage_in_excess) 396 unsigned long long new_usage_in_excess)
397 { 397 {
398 struct rb_node **p = &mctz->rb_root.rb_node; 398 struct rb_node **p = &mctz->rb_root.rb_node;
399 struct rb_node *parent = NULL; 399 struct rb_node *parent = NULL;
400 struct mem_cgroup_per_zone *mz_node; 400 struct mem_cgroup_per_zone *mz_node;
401 401
402 if (mz->on_tree) 402 if (mz->on_tree)
403 return; 403 return;
404 404
405 mz->usage_in_excess = new_usage_in_excess; 405 mz->usage_in_excess = new_usage_in_excess;
406 if (!mz->usage_in_excess) 406 if (!mz->usage_in_excess)
407 return; 407 return;
408 while (*p) { 408 while (*p) {
409 parent = *p; 409 parent = *p;
410 mz_node = rb_entry(parent, struct mem_cgroup_per_zone, 410 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
411 tree_node); 411 tree_node);
412 if (mz->usage_in_excess < mz_node->usage_in_excess) 412 if (mz->usage_in_excess < mz_node->usage_in_excess)
413 p = &(*p)->rb_left; 413 p = &(*p)->rb_left;
414 /* 414 /*
415 * We can't avoid mem cgroups that are over their soft 415 * We can't avoid mem cgroups that are over their soft
416 * limit by the same amount 416 * limit by the same amount
417 */ 417 */
418 else if (mz->usage_in_excess >= mz_node->usage_in_excess) 418 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
419 p = &(*p)->rb_right; 419 p = &(*p)->rb_right;
420 } 420 }
421 rb_link_node(&mz->tree_node, parent, p); 421 rb_link_node(&mz->tree_node, parent, p);
422 rb_insert_color(&mz->tree_node, &mctz->rb_root); 422 rb_insert_color(&mz->tree_node, &mctz->rb_root);
423 mz->on_tree = true; 423 mz->on_tree = true;
424 } 424 }
425 425
426 static void 426 static void
427 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 427 __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
428 struct mem_cgroup_per_zone *mz, 428 struct mem_cgroup_per_zone *mz,
429 struct mem_cgroup_tree_per_zone *mctz) 429 struct mem_cgroup_tree_per_zone *mctz)
430 { 430 {
431 if (!mz->on_tree) 431 if (!mz->on_tree)
432 return; 432 return;
433 rb_erase(&mz->tree_node, &mctz->rb_root); 433 rb_erase(&mz->tree_node, &mctz->rb_root);
434 mz->on_tree = false; 434 mz->on_tree = false;
435 } 435 }
436 436
437 static void 437 static void
438 mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 438 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
439 struct mem_cgroup_per_zone *mz, 439 struct mem_cgroup_per_zone *mz,
440 struct mem_cgroup_tree_per_zone *mctz) 440 struct mem_cgroup_tree_per_zone *mctz)
441 { 441 {
442 spin_lock(&mctz->lock); 442 spin_lock(&mctz->lock);
443 __mem_cgroup_remove_exceeded(mem, mz, mctz); 443 __mem_cgroup_remove_exceeded(mem, mz, mctz);
444 spin_unlock(&mctz->lock); 444 spin_unlock(&mctz->lock);
445 } 445 }
446 446
447 447
448 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 448 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
449 { 449 {
450 unsigned long long excess; 450 unsigned long long excess;
451 struct mem_cgroup_per_zone *mz; 451 struct mem_cgroup_per_zone *mz;
452 struct mem_cgroup_tree_per_zone *mctz; 452 struct mem_cgroup_tree_per_zone *mctz;
453 int nid = page_to_nid(page); 453 int nid = page_to_nid(page);
454 int zid = page_zonenum(page); 454 int zid = page_zonenum(page);
455 mctz = soft_limit_tree_from_page(page); 455 mctz = soft_limit_tree_from_page(page);
456 456
457 /* 457 /*
458 * Necessary to update all ancestors when hierarchy is used. 458 * Necessary to update all ancestors when hierarchy is used.
459 * because their event counter is not touched. 459 * because their event counter is not touched.
460 */ 460 */
461 for (; mem; mem = parent_mem_cgroup(mem)) { 461 for (; mem; mem = parent_mem_cgroup(mem)) {
462 mz = mem_cgroup_zoneinfo(mem, nid, zid); 462 mz = mem_cgroup_zoneinfo(mem, nid, zid);
463 excess = res_counter_soft_limit_excess(&mem->res); 463 excess = res_counter_soft_limit_excess(&mem->res);
464 /* 464 /*
465 * We have to update the tree if mz is on RB-tree or 465 * We have to update the tree if mz is on RB-tree or
466 * mem is over its softlimit. 466 * mem is over its softlimit.
467 */ 467 */
468 if (excess || mz->on_tree) { 468 if (excess || mz->on_tree) {
469 spin_lock(&mctz->lock); 469 spin_lock(&mctz->lock);
470 /* if on-tree, remove it */ 470 /* if on-tree, remove it */
471 if (mz->on_tree) 471 if (mz->on_tree)
472 __mem_cgroup_remove_exceeded(mem, mz, mctz); 472 __mem_cgroup_remove_exceeded(mem, mz, mctz);
473 /* 473 /*
474 * Insert again. mz->usage_in_excess will be updated. 474 * Insert again. mz->usage_in_excess will be updated.
475 * If excess is 0, no tree ops. 475 * If excess is 0, no tree ops.
476 */ 476 */
477 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 477 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
478 spin_unlock(&mctz->lock); 478 spin_unlock(&mctz->lock);
479 } 479 }
480 } 480 }
481 } 481 }
482 482
483 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 483 static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
484 { 484 {
485 int node, zone; 485 int node, zone;
486 struct mem_cgroup_per_zone *mz; 486 struct mem_cgroup_per_zone *mz;
487 struct mem_cgroup_tree_per_zone *mctz; 487 struct mem_cgroup_tree_per_zone *mctz;
488 488
489 for_each_node_state(node, N_POSSIBLE) { 489 for_each_node_state(node, N_POSSIBLE) {
490 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 490 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
491 mz = mem_cgroup_zoneinfo(mem, node, zone); 491 mz = mem_cgroup_zoneinfo(mem, node, zone);
492 mctz = soft_limit_tree_node_zone(node, zone); 492 mctz = soft_limit_tree_node_zone(node, zone);
493 mem_cgroup_remove_exceeded(mem, mz, mctz); 493 mem_cgroup_remove_exceeded(mem, mz, mctz);
494 } 494 }
495 } 495 }
496 } 496 }
497 497
498 static struct mem_cgroup_per_zone * 498 static struct mem_cgroup_per_zone *
499 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 499 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
500 { 500 {
501 struct rb_node *rightmost = NULL; 501 struct rb_node *rightmost = NULL;
502 struct mem_cgroup_per_zone *mz; 502 struct mem_cgroup_per_zone *mz;
503 503
504 retry: 504 retry:
505 mz = NULL; 505 mz = NULL;
506 rightmost = rb_last(&mctz->rb_root); 506 rightmost = rb_last(&mctz->rb_root);
507 if (!rightmost) 507 if (!rightmost)
508 goto done; /* Nothing to reclaim from */ 508 goto done; /* Nothing to reclaim from */
509 509
510 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); 510 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
511 /* 511 /*
512 * Remove the node now but someone else can add it back, 512 * Remove the node now but someone else can add it back,
513 * we will to add it back at the end of reclaim to its correct 513 * we will to add it back at the end of reclaim to its correct
514 * position in the tree. 514 * position in the tree.
515 */ 515 */
516 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 516 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
517 if (!res_counter_soft_limit_excess(&mz->mem->res) || 517 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
518 !css_tryget(&mz->mem->css)) 518 !css_tryget(&mz->mem->css))
519 goto retry; 519 goto retry;
520 done: 520 done:
521 return mz; 521 return mz;
522 } 522 }
523 523
524 static struct mem_cgroup_per_zone * 524 static struct mem_cgroup_per_zone *
525 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 525 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
526 { 526 {
527 struct mem_cgroup_per_zone *mz; 527 struct mem_cgroup_per_zone *mz;
528 528
529 spin_lock(&mctz->lock); 529 spin_lock(&mctz->lock);
530 mz = __mem_cgroup_largest_soft_limit_node(mctz); 530 mz = __mem_cgroup_largest_soft_limit_node(mctz);
531 spin_unlock(&mctz->lock); 531 spin_unlock(&mctz->lock);
532 return mz; 532 return mz;
533 } 533 }
534 534
535 /* 535 /*
536 * Implementation Note: reading percpu statistics for memcg. 536 * Implementation Note: reading percpu statistics for memcg.
537 * 537 *
538 * Both of vmstat[] and percpu_counter has threshold and do periodic 538 * Both of vmstat[] and percpu_counter has threshold and do periodic
539 * synchronization to implement "quick" read. There are trade-off between 539 * synchronization to implement "quick" read. There are trade-off between
540 * reading cost and precision of value. Then, we may have a chance to implement 540 * reading cost and precision of value. Then, we may have a chance to implement
541 * a periodic synchronizion of counter in memcg's counter. 541 * a periodic synchronizion of counter in memcg's counter.
542 * 542 *
543 * But this _read() function is used for user interface now. The user accounts 543 * But this _read() function is used for user interface now. The user accounts
544 * memory usage by memory cgroup and he _always_ requires exact value because 544 * memory usage by memory cgroup and he _always_ requires exact value because
545 * he accounts memory. Even if we provide quick-and-fuzzy read, we always 545 * he accounts memory. Even if we provide quick-and-fuzzy read, we always
546 * have to visit all online cpus and make sum. So, for now, unnecessary 546 * have to visit all online cpus and make sum. So, for now, unnecessary
547 * synchronization is not implemented. (just implemented for cpu hotplug) 547 * synchronization is not implemented. (just implemented for cpu hotplug)
548 * 548 *
549 * If there are kernel internal actions which can make use of some not-exact 549 * If there are kernel internal actions which can make use of some not-exact
550 * value, and reading all cpu value can be performance bottleneck in some 550 * value, and reading all cpu value can be performance bottleneck in some
551 * common workload, threashold and synchonization as vmstat[] should be 551 * common workload, threashold and synchonization as vmstat[] should be
552 * implemented. 552 * implemented.
553 */ 553 */
554 static long mem_cgroup_read_stat(struct mem_cgroup *mem, 554 static long mem_cgroup_read_stat(struct mem_cgroup *mem,
555 enum mem_cgroup_stat_index idx) 555 enum mem_cgroup_stat_index idx)
556 { 556 {
557 long val = 0; 557 long val = 0;
558 int cpu; 558 int cpu;
559 559
560 get_online_cpus(); 560 get_online_cpus();
561 for_each_online_cpu(cpu) 561 for_each_online_cpu(cpu)
562 val += per_cpu(mem->stat->count[idx], cpu); 562 val += per_cpu(mem->stat->count[idx], cpu);
563 #ifdef CONFIG_HOTPLUG_CPU 563 #ifdef CONFIG_HOTPLUG_CPU
564 spin_lock(&mem->pcp_counter_lock); 564 spin_lock(&mem->pcp_counter_lock);
565 val += mem->nocpu_base.count[idx]; 565 val += mem->nocpu_base.count[idx];
566 spin_unlock(&mem->pcp_counter_lock); 566 spin_unlock(&mem->pcp_counter_lock);
567 #endif 567 #endif
568 put_online_cpus(); 568 put_online_cpus();
569 return val; 569 return val;
570 } 570 }
571 571
572 static long mem_cgroup_local_usage(struct mem_cgroup *mem) 572 static long mem_cgroup_local_usage(struct mem_cgroup *mem)
573 { 573 {
574 long ret; 574 long ret;
575 575
576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 576 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 577 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
578 return ret; 578 return ret;
579 } 579 }
580 580
581 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 581 static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
582 bool charge) 582 bool charge)
583 { 583 {
584 int val = (charge) ? 1 : -1; 584 int val = (charge) ? 1 : -1;
585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 585 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
586 } 586 }
587 587
588 static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 588 static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
589 enum mem_cgroup_events_index idx) 589 enum mem_cgroup_events_index idx)
590 { 590 {
591 unsigned long val = 0; 591 unsigned long val = 0;
592 int cpu; 592 int cpu;
593 593
594 for_each_online_cpu(cpu) 594 for_each_online_cpu(cpu)
595 val += per_cpu(mem->stat->events[idx], cpu); 595 val += per_cpu(mem->stat->events[idx], cpu);
596 #ifdef CONFIG_HOTPLUG_CPU 596 #ifdef CONFIG_HOTPLUG_CPU
597 spin_lock(&mem->pcp_counter_lock); 597 spin_lock(&mem->pcp_counter_lock);
598 val += mem->nocpu_base.events[idx]; 598 val += mem->nocpu_base.events[idx];
599 spin_unlock(&mem->pcp_counter_lock); 599 spin_unlock(&mem->pcp_counter_lock);
600 #endif 600 #endif
601 return val; 601 return val;
602 } 602 }
603 603
604 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 604 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
605 bool file, int nr_pages) 605 bool file, int nr_pages)
606 { 606 {
607 preempt_disable(); 607 preempt_disable();
608 608
609 if (file) 609 if (file)
610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 610 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
611 else 611 else
612 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 612 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
613 613
614 /* pagein of a big page is an event. So, ignore page size */ 614 /* pagein of a big page is an event. So, ignore page size */
615 if (nr_pages > 0) 615 if (nr_pages > 0)
616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 616 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
617 else { 617 else {
618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 618 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
619 nr_pages = -nr_pages; /* for event */ 619 nr_pages = -nr_pages; /* for event */
620 } 620 }
621 621
622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 622 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
623 623
624 preempt_enable(); 624 preempt_enable();
625 } 625 }
626 626
627 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 627 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
628 enum lru_list idx) 628 enum lru_list idx)
629 { 629 {
630 int nid, zid; 630 int nid, zid;
631 struct mem_cgroup_per_zone *mz; 631 struct mem_cgroup_per_zone *mz;
632 u64 total = 0; 632 u64 total = 0;
633 633
634 for_each_online_node(nid) 634 for_each_online_node(nid)
635 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 635 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
636 mz = mem_cgroup_zoneinfo(mem, nid, zid); 636 mz = mem_cgroup_zoneinfo(mem, nid, zid);
637 total += MEM_CGROUP_ZSTAT(mz, idx); 637 total += MEM_CGROUP_ZSTAT(mz, idx);
638 } 638 }
639 return total; 639 return total;
640 } 640 }
641 641
642 static bool __memcg_event_check(struct mem_cgroup *mem, int target) 642 static bool __memcg_event_check(struct mem_cgroup *mem, int target)
643 { 643 {
644 unsigned long val, next; 644 unsigned long val, next;
645 645
646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 646 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
647 next = this_cpu_read(mem->stat->targets[target]); 647 next = this_cpu_read(mem->stat->targets[target]);
648 /* from time_after() in jiffies.h */ 648 /* from time_after() in jiffies.h */
649 return ((long)next - (long)val < 0); 649 return ((long)next - (long)val < 0);
650 } 650 }
651 651
652 static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 652 static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
653 { 653 {
654 unsigned long val, next; 654 unsigned long val, next;
655 655
656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 656 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
657 657
658 switch (target) { 658 switch (target) {
659 case MEM_CGROUP_TARGET_THRESH: 659 case MEM_CGROUP_TARGET_THRESH:
660 next = val + THRESHOLDS_EVENTS_TARGET; 660 next = val + THRESHOLDS_EVENTS_TARGET;
661 break; 661 break;
662 case MEM_CGROUP_TARGET_SOFTLIMIT: 662 case MEM_CGROUP_TARGET_SOFTLIMIT:
663 next = val + SOFTLIMIT_EVENTS_TARGET; 663 next = val + SOFTLIMIT_EVENTS_TARGET;
664 break; 664 break;
665 default: 665 default:
666 return; 666 return;
667 } 667 }
668 668
669 this_cpu_write(mem->stat->targets[target], next); 669 this_cpu_write(mem->stat->targets[target], next);
670 } 670 }
671 671
672 /* 672 /*
673 * Check events in order. 673 * Check events in order.
674 * 674 *
675 */ 675 */
676 static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 676 static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
677 { 677 {
678 /* threshold event is triggered in finer grain than soft limit */ 678 /* threshold event is triggered in finer grain than soft limit */
679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 679 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
680 mem_cgroup_threshold(mem); 680 mem_cgroup_threshold(mem);
681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 681 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
682 if (unlikely(__memcg_event_check(mem, 682 if (unlikely(__memcg_event_check(mem,
683 MEM_CGROUP_TARGET_SOFTLIMIT))){ 683 MEM_CGROUP_TARGET_SOFTLIMIT))){
684 mem_cgroup_update_tree(mem, page); 684 mem_cgroup_update_tree(mem, page);
685 __mem_cgroup_target_update(mem, 685 __mem_cgroup_target_update(mem,
686 MEM_CGROUP_TARGET_SOFTLIMIT); 686 MEM_CGROUP_TARGET_SOFTLIMIT);
687 } 687 }
688 } 688 }
689 } 689 }
690 690
691 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 691 static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
692 { 692 {
693 return container_of(cgroup_subsys_state(cont, 693 return container_of(cgroup_subsys_state(cont,
694 mem_cgroup_subsys_id), struct mem_cgroup, 694 mem_cgroup_subsys_id), struct mem_cgroup,
695 css); 695 css);
696 } 696 }
697 697
698 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 698 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
699 { 699 {
700 /* 700 /*
701 * mm_update_next_owner() may clear mm->owner to NULL 701 * mm_update_next_owner() may clear mm->owner to NULL
702 * if it races with swapoff, page migration, etc. 702 * if it races with swapoff, page migration, etc.
703 * So this can be called with p == NULL. 703 * So this can be called with p == NULL.
704 */ 704 */
705 if (unlikely(!p)) 705 if (unlikely(!p))
706 return NULL; 706 return NULL;
707 707
708 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 708 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
709 struct mem_cgroup, css); 709 struct mem_cgroup, css);
710 } 710 }
711 711
712 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 712 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
713 { 713 {
714 struct mem_cgroup *mem = NULL; 714 struct mem_cgroup *mem = NULL;
715 715
716 if (!mm) 716 if (!mm)
717 return NULL; 717 return NULL;
718 /* 718 /*
719 * Because we have no locks, mm->owner's may be being moved to other 719 * Because we have no locks, mm->owner's may be being moved to other
720 * cgroup. We use css_tryget() here even if this looks 720 * cgroup. We use css_tryget() here even if this looks
721 * pessimistic (rather than adding locks here). 721 * pessimistic (rather than adding locks here).
722 */ 722 */
723 rcu_read_lock(); 723 rcu_read_lock();
724 do { 724 do {
725 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 725 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
726 if (unlikely(!mem)) 726 if (unlikely(!mem))
727 break; 727 break;
728 } while (!css_tryget(&mem->css)); 728 } while (!css_tryget(&mem->css));
729 rcu_read_unlock(); 729 rcu_read_unlock();
730 return mem; 730 return mem;
731 } 731 }
732 732
733 /* The caller has to guarantee "mem" exists before calling this */ 733 /* The caller has to guarantee "mem" exists before calling this */
734 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 734 static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
735 { 735 {
736 struct cgroup_subsys_state *css; 736 struct cgroup_subsys_state *css;
737 int found; 737 int found;
738 738
739 if (!mem) /* ROOT cgroup has the smallest ID */ 739 if (!mem) /* ROOT cgroup has the smallest ID */
740 return root_mem_cgroup; /*css_put/get against root is ignored*/ 740 return root_mem_cgroup; /*css_put/get against root is ignored*/
741 if (!mem->use_hierarchy) { 741 if (!mem->use_hierarchy) {
742 if (css_tryget(&mem->css)) 742 if (css_tryget(&mem->css))
743 return mem; 743 return mem;
744 return NULL; 744 return NULL;
745 } 745 }
746 rcu_read_lock(); 746 rcu_read_lock();
747 /* 747 /*
748 * searching a memory cgroup which has the smallest ID under given 748 * searching a memory cgroup which has the smallest ID under given
749 * ROOT cgroup. (ID >= 1) 749 * ROOT cgroup. (ID >= 1)
750 */ 750 */
751 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 751 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
752 if (css && css_tryget(css)) 752 if (css && css_tryget(css))
753 mem = container_of(css, struct mem_cgroup, css); 753 mem = container_of(css, struct mem_cgroup, css);
754 else 754 else
755 mem = NULL; 755 mem = NULL;
756 rcu_read_unlock(); 756 rcu_read_unlock();
757 return mem; 757 return mem;
758 } 758 }
759 759
760 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 760 static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
761 struct mem_cgroup *root, 761 struct mem_cgroup *root,
762 bool cond) 762 bool cond)
763 { 763 {
764 int nextid = css_id(&iter->css) + 1; 764 int nextid = css_id(&iter->css) + 1;
765 int found; 765 int found;
766 int hierarchy_used; 766 int hierarchy_used;
767 struct cgroup_subsys_state *css; 767 struct cgroup_subsys_state *css;
768 768
769 hierarchy_used = iter->use_hierarchy; 769 hierarchy_used = iter->use_hierarchy;
770 770
771 css_put(&iter->css); 771 css_put(&iter->css);
772 /* If no ROOT, walk all, ignore hierarchy */ 772 /* If no ROOT, walk all, ignore hierarchy */
773 if (!cond || (root && !hierarchy_used)) 773 if (!cond || (root && !hierarchy_used))
774 return NULL; 774 return NULL;
775 775
776 if (!root) 776 if (!root)
777 root = root_mem_cgroup; 777 root = root_mem_cgroup;
778 778
779 do { 779 do {
780 iter = NULL; 780 iter = NULL;
781 rcu_read_lock(); 781 rcu_read_lock();
782 782
783 css = css_get_next(&mem_cgroup_subsys, nextid, 783 css = css_get_next(&mem_cgroup_subsys, nextid,
784 &root->css, &found); 784 &root->css, &found);
785 if (css && css_tryget(css)) 785 if (css && css_tryget(css))
786 iter = container_of(css, struct mem_cgroup, css); 786 iter = container_of(css, struct mem_cgroup, css);
787 rcu_read_unlock(); 787 rcu_read_unlock();
788 /* If css is NULL, no more cgroups will be found */ 788 /* If css is NULL, no more cgroups will be found */
789 nextid = found + 1; 789 nextid = found + 1;
790 } while (css && !iter); 790 } while (css && !iter);
791 791
792 return iter; 792 return iter;
793 } 793 }
794 /* 794 /*
795 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please 795 * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
796 * be careful that "break" loop is not allowed. We have reference count. 796 * be careful that "break" loop is not allowed. We have reference count.
797 * Instead of that modify "cond" to be false and "continue" to exit the loop. 797 * Instead of that modify "cond" to be false and "continue" to exit the loop.
798 */ 798 */
799 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ 799 #define for_each_mem_cgroup_tree_cond(iter, root, cond) \
800 for (iter = mem_cgroup_start_loop(root);\ 800 for (iter = mem_cgroup_start_loop(root);\
801 iter != NULL;\ 801 iter != NULL;\
802 iter = mem_cgroup_get_next(iter, root, cond)) 802 iter = mem_cgroup_get_next(iter, root, cond))
803 803
804 #define for_each_mem_cgroup_tree(iter, root) \ 804 #define for_each_mem_cgroup_tree(iter, root) \
805 for_each_mem_cgroup_tree_cond(iter, root, true) 805 for_each_mem_cgroup_tree_cond(iter, root, true)
806 806
807 #define for_each_mem_cgroup_all(iter) \ 807 #define for_each_mem_cgroup_all(iter) \
808 for_each_mem_cgroup_tree_cond(iter, NULL, true) 808 for_each_mem_cgroup_tree_cond(iter, NULL, true)
809 809
810 810
811 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 811 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
812 { 812 {
813 return (mem == root_mem_cgroup); 813 return (mem == root_mem_cgroup);
814 } 814 }
815 815
816 /* 816 /*
817 * Following LRU functions are allowed to be used without PCG_LOCK. 817 * Following LRU functions are allowed to be used without PCG_LOCK.
818 * Operations are called by routine of global LRU independently from memcg. 818 * Operations are called by routine of global LRU independently from memcg.
819 * What we have to take care of here is validness of pc->mem_cgroup. 819 * What we have to take care of here is validness of pc->mem_cgroup.
820 * 820 *
821 * Changes to pc->mem_cgroup happens when 821 * Changes to pc->mem_cgroup happens when
822 * 1. charge 822 * 1. charge
823 * 2. moving account 823 * 2. moving account
824 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. 824 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
825 * It is added to LRU before charge. 825 * It is added to LRU before charge.
826 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. 826 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
827 * When moving account, the page is not on LRU. It's isolated. 827 * When moving account, the page is not on LRU. It's isolated.
828 */ 828 */
829 829
830 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 830 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
831 { 831 {
832 struct page_cgroup *pc; 832 struct page_cgroup *pc;
833 struct mem_cgroup_per_zone *mz; 833 struct mem_cgroup_per_zone *mz;
834 834
835 if (mem_cgroup_disabled()) 835 if (mem_cgroup_disabled())
836 return; 836 return;
837 pc = lookup_page_cgroup(page); 837 pc = lookup_page_cgroup(page);
838 /* can happen while we handle swapcache. */ 838 /* can happen while we handle swapcache. */
839 if (!TestClearPageCgroupAcctLRU(pc)) 839 if (!TestClearPageCgroupAcctLRU(pc))
840 return; 840 return;
841 VM_BUG_ON(!pc->mem_cgroup); 841 VM_BUG_ON(!pc->mem_cgroup);
842 /* 842 /*
843 * We don't check PCG_USED bit. It's cleared when the "page" is finally 843 * We don't check PCG_USED bit. It's cleared when the "page" is finally
844 * removed from global LRU. 844 * removed from global LRU.
845 */ 845 */
846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 846 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
847 /* huge page split is done under lru_lock. so, we have no races. */ 847 /* huge page split is done under lru_lock. so, we have no races. */
848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); 848 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
849 if (mem_cgroup_is_root(pc->mem_cgroup)) 849 if (mem_cgroup_is_root(pc->mem_cgroup))
850 return; 850 return;
851 VM_BUG_ON(list_empty(&pc->lru)); 851 VM_BUG_ON(list_empty(&pc->lru));
852 list_del_init(&pc->lru); 852 list_del_init(&pc->lru);
853 } 853 }
854 854
855 void mem_cgroup_del_lru(struct page *page) 855 void mem_cgroup_del_lru(struct page *page)
856 { 856 {
857 mem_cgroup_del_lru_list(page, page_lru(page)); 857 mem_cgroup_del_lru_list(page, page_lru(page));
858 } 858 }
859 859
860 /* 860 /*
861 * Writeback is about to end against a page which has been marked for immediate 861 * Writeback is about to end against a page which has been marked for immediate
862 * reclaim. If it still appears to be reclaimable, move it to the tail of the 862 * reclaim. If it still appears to be reclaimable, move it to the tail of the
863 * inactive list. 863 * inactive list.
864 */ 864 */
865 void mem_cgroup_rotate_reclaimable_page(struct page *page) 865 void mem_cgroup_rotate_reclaimable_page(struct page *page)
866 { 866 {
867 struct mem_cgroup_per_zone *mz; 867 struct mem_cgroup_per_zone *mz;
868 struct page_cgroup *pc; 868 struct page_cgroup *pc;
869 enum lru_list lru = page_lru(page); 869 enum lru_list lru = page_lru(page);
870 870
871 if (mem_cgroup_disabled()) 871 if (mem_cgroup_disabled())
872 return; 872 return;
873 873
874 pc = lookup_page_cgroup(page); 874 pc = lookup_page_cgroup(page);
875 /* unused or root page is not rotated. */ 875 /* unused or root page is not rotated. */
876 if (!PageCgroupUsed(pc)) 876 if (!PageCgroupUsed(pc))
877 return; 877 return;
878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 878 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
879 smp_rmb(); 879 smp_rmb();
880 if (mem_cgroup_is_root(pc->mem_cgroup)) 880 if (mem_cgroup_is_root(pc->mem_cgroup))
881 return; 881 return;
882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 882 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
883 list_move_tail(&pc->lru, &mz->lists[lru]); 883 list_move_tail(&pc->lru, &mz->lists[lru]);
884 } 884 }
885 885
886 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 886 void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
887 { 887 {
888 struct mem_cgroup_per_zone *mz; 888 struct mem_cgroup_per_zone *mz;
889 struct page_cgroup *pc; 889 struct page_cgroup *pc;
890 890
891 if (mem_cgroup_disabled()) 891 if (mem_cgroup_disabled())
892 return; 892 return;
893 893
894 pc = lookup_page_cgroup(page); 894 pc = lookup_page_cgroup(page);
895 /* unused or root page is not rotated. */ 895 /* unused or root page is not rotated. */
896 if (!PageCgroupUsed(pc)) 896 if (!PageCgroupUsed(pc))
897 return; 897 return;
898 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 898 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
899 smp_rmb(); 899 smp_rmb();
900 if (mem_cgroup_is_root(pc->mem_cgroup)) 900 if (mem_cgroup_is_root(pc->mem_cgroup))
901 return; 901 return;
902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 902 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
903 list_move(&pc->lru, &mz->lists[lru]); 903 list_move(&pc->lru, &mz->lists[lru]);
904 } 904 }
905 905
906 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) 906 void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
907 { 907 {
908 struct page_cgroup *pc; 908 struct page_cgroup *pc;
909 struct mem_cgroup_per_zone *mz; 909 struct mem_cgroup_per_zone *mz;
910 910
911 if (mem_cgroup_disabled()) 911 if (mem_cgroup_disabled())
912 return; 912 return;
913 pc = lookup_page_cgroup(page); 913 pc = lookup_page_cgroup(page);
914 VM_BUG_ON(PageCgroupAcctLRU(pc)); 914 VM_BUG_ON(PageCgroupAcctLRU(pc));
915 if (!PageCgroupUsed(pc)) 915 if (!PageCgroupUsed(pc))
916 return; 916 return;
917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 917 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
918 smp_rmb(); 918 smp_rmb();
919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 919 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
920 /* huge page split is done under lru_lock. so, we have no races. */ 920 /* huge page split is done under lru_lock. so, we have no races. */
921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); 921 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
922 SetPageCgroupAcctLRU(pc); 922 SetPageCgroupAcctLRU(pc);
923 if (mem_cgroup_is_root(pc->mem_cgroup)) 923 if (mem_cgroup_is_root(pc->mem_cgroup))
924 return; 924 return;
925 list_add(&pc->lru, &mz->lists[lru]); 925 list_add(&pc->lru, &mz->lists[lru]);
926 } 926 }
927 927
928 /* 928 /*
929 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed 929 * When handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
930 * while it's linked to the lru, because the page may be reused after it's fully 930 * while it's linked to the lru, because the page may be reused after it's fully
931 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again. 931 * uncharged. To handle that, unlink page_cgroup from the LRU when charging it again.
932 * It's done under lock_page and it is expected that zone->lru_lock is never held. 932 * It's done under lock_page and it is expected that zone->lru_lock is never held.
933 */ 933 */
934 static void mem_cgroup_lru_del_before_commit(struct page *page) 934 static void mem_cgroup_lru_del_before_commit(struct page *page)
935 { 935 {
936 unsigned long flags; 936 unsigned long flags;
937 struct zone *zone = page_zone(page); 937 struct zone *zone = page_zone(page);
938 struct page_cgroup *pc = lookup_page_cgroup(page); 938 struct page_cgroup *pc = lookup_page_cgroup(page);
939 939
940 /* 940 /*
941 * Doing this check without taking ->lru_lock seems wrong, but this 941 * Doing this check without taking ->lru_lock seems wrong, but this
942 * is safe, because if page_cgroup's USED bit is unset, the page 942 * is safe, because if page_cgroup's USED bit is unset, the page
943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is 943 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
944 * set, the commit after this will fail anyway. 944 * set, the commit after this will fail anyway.
945 * All of this charge/uncharge is done under some mutual exclusion. 945 * All of this charge/uncharge is done under some mutual exclusion.
946 * So, we don't need to take care of changes in the USED bit. 946 * So, we don't need to take care of changes in the USED bit.
947 */ 947 */
948 if (likely(!PageLRU(page))) 948 if (likely(!PageLRU(page)))
949 return; 949 return;
950 950
951 spin_lock_irqsave(&zone->lru_lock, flags); 951 spin_lock_irqsave(&zone->lru_lock, flags);
952 /* 952 /*
953 * Forget old LRU when this page_cgroup is *not* used. This Used bit 953 * Forget old LRU when this page_cgroup is *not* used. This Used bit
954 * is guarded by lock_page() because the page is SwapCache. 954 * is guarded by lock_page() because the page is SwapCache.
955 */ 955 */
956 if (!PageCgroupUsed(pc)) 956 if (!PageCgroupUsed(pc))
957 mem_cgroup_del_lru_list(page, page_lru(page)); 957 mem_cgroup_del_lru_list(page, page_lru(page));
958 spin_unlock_irqrestore(&zone->lru_lock, flags); 958 spin_unlock_irqrestore(&zone->lru_lock, flags);
959 } 959 }
960 960
961 static void mem_cgroup_lru_add_after_commit(struct page *page) 961 static void mem_cgroup_lru_add_after_commit(struct page *page)
962 { 962 {
963 unsigned long flags; 963 unsigned long flags;
964 struct zone *zone = page_zone(page); 964 struct zone *zone = page_zone(page);
965 struct page_cgroup *pc = lookup_page_cgroup(page); 965 struct page_cgroup *pc = lookup_page_cgroup(page);
966 966
967 /* take care of the case where the page is added to LRU while we commit it */ 967 /* take care of the case where the page is added to LRU while we commit it */
968 if (likely(!PageLRU(page))) 968 if (likely(!PageLRU(page)))
969 return; 969 return;
970 spin_lock_irqsave(&zone->lru_lock, flags); 970 spin_lock_irqsave(&zone->lru_lock, flags);
971 /* link when the page is linked to LRU but page_cgroup isn't */ 971 /* link when the page is linked to LRU but page_cgroup isn't */
972 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 972 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
973 mem_cgroup_add_lru_list(page, page_lru(page)); 973 mem_cgroup_add_lru_list(page, page_lru(page));
974 spin_unlock_irqrestore(&zone->lru_lock, flags); 974 spin_unlock_irqrestore(&zone->lru_lock, flags);
975 } 975 }
976 976
977 977
978 void mem_cgroup_move_lists(struct page *page, 978 void mem_cgroup_move_lists(struct page *page,
979 enum lru_list from, enum lru_list to) 979 enum lru_list from, enum lru_list to)
980 { 980 {
981 if (mem_cgroup_disabled()) 981 if (mem_cgroup_disabled())
982 return; 982 return;
983 mem_cgroup_del_lru_list(page, from); 983 mem_cgroup_del_lru_list(page, from);
984 mem_cgroup_add_lru_list(page, to); 984 mem_cgroup_add_lru_list(page, to);
985 } 985 }
986 986
987 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 987 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
988 { 988 {
989 int ret; 989 int ret;
990 struct mem_cgroup *curr = NULL; 990 struct mem_cgroup *curr = NULL;
991 struct task_struct *p; 991 struct task_struct *p;
992 992
993 p = find_lock_task_mm(task); 993 p = find_lock_task_mm(task);
994 if (!p) 994 if (!p)
995 return 0; 995 return 0;
996 curr = try_get_mem_cgroup_from_mm(p->mm); 996 curr = try_get_mem_cgroup_from_mm(p->mm);
997 task_unlock(p); 997 task_unlock(p);
998 if (!curr) 998 if (!curr)
999 return 0; 999 return 0;
1000 /* 1000 /*
1001 * We should check use_hierarchy of "mem", not "curr". Checking 1001 * We should check use_hierarchy of "mem", not "curr". Checking
1002 * use_hierarchy of "curr" here would make this function true if hierarchy is 1002 * use_hierarchy of "curr" here would make this function true if hierarchy is
1003 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup* 1003 * enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
1004 * hierarchy (even if use_hierarchy is disabled in "mem"). 1004 * hierarchy (even if use_hierarchy is disabled in "mem").
1005 */ 1005 */
1006 if (mem->use_hierarchy) 1006 if (mem->use_hierarchy)
1007 ret = css_is_ancestor(&curr->css, &mem->css); 1007 ret = css_is_ancestor(&curr->css, &mem->css);
1008 else 1008 else
1009 ret = (curr == mem); 1009 ret = (curr == mem);
1010 css_put(&curr->css); 1010 css_put(&curr->css);
1011 return ret; 1011 return ret;
1012 } 1012 }
1013 1013
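A membership test like the one above either walks ancestors (when use_hierarchy is set) or compares pointers directly. As a rough user-space illustration, not part of memcontrol.c, with a hypothetical fake_memcg type and an is_ancestor() helper standing in for css_is_ancestor():

/* Hypothetical stand-ins for the hierarchy test used in task_in_mem_cgroup(). */
struct fake_memcg {
	struct fake_memcg *parent;
	int use_hierarchy;
};

/* true if "ancestor" is "child" itself or lies above it */
static int is_ancestor(struct fake_memcg *ancestor, struct fake_memcg *child)
{
	for (; child; child = child->parent)
		if (child == ancestor)
			return 1;
	return 0;
}

static int task_memcg_matches(struct fake_memcg *mem, struct fake_memcg *curr)
{
	/* mirror the use_hierarchy decision: ancestor walk vs. plain equality */
	return mem->use_hierarchy ? is_ancestor(mem, curr) : (curr == mem);
}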
1014 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1014 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
1015 { 1015 {
1016 unsigned long active; 1016 unsigned long active;
1017 unsigned long inactive; 1017 unsigned long inactive;
1018 unsigned long gb; 1018 unsigned long gb;
1019 unsigned long inactive_ratio; 1019 unsigned long inactive_ratio;
1020 1020
1021 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); 1021 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
1022 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); 1022 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
1023 1023
1024 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1024 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1025 if (gb) 1025 if (gb)
1026 inactive_ratio = int_sqrt(10 * gb); 1026 inactive_ratio = int_sqrt(10 * gb);
1027 else 1027 else
1028 inactive_ratio = 1; 1028 inactive_ratio = 1;
1029 1029
1030 if (present_pages) { 1030 if (present_pages) {
1031 present_pages[0] = inactive; 1031 present_pages[0] = inactive;
1032 present_pages[1] = active; 1032 present_pages[1] = active;
1033 } 1033 }
1034 1034
1035 return inactive_ratio; 1035 return inactive_ratio;
1036 } 1036 }
1037 1037
1038 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) 1038 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1039 { 1039 {
1040 unsigned long active; 1040 unsigned long active;
1041 unsigned long inactive; 1041 unsigned long inactive;
1042 unsigned long present_pages[2]; 1042 unsigned long present_pages[2];
1043 unsigned long inactive_ratio; 1043 unsigned long inactive_ratio;
1044 1044
1045 inactive_ratio = calc_inactive_ratio(memcg, present_pages); 1045 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1046 1046
1047 inactive = present_pages[0]; 1047 inactive = present_pages[0];
1048 active = present_pages[1]; 1048 active = present_pages[1];
1049 1049
1050 if (inactive * inactive_ratio < active) 1050 if (inactive * inactive_ratio < active)
1051 return 1; 1051 return 1;
1052 1052
1053 return 0; 1053 return 0;
1054 } 1054 }
1055 1055
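The ratio above grows with the square root of the anon working-set size in gigabytes, so bigger memcgs tolerate proportionally smaller inactive lists. A stand-alone sketch of the same arithmetic, assuming 4KiB pages and using libm's sqrt() in place of the kernel's int_sqrt() (an illustration, not the kernel implementation):

#include <math.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT 12	/* assume 4KiB pages for the example */

/* same shape as calc_inactive_ratio(): ratio = sqrt(10 * size-in-GB), min 1 */
static unsigned long inactive_ratio(unsigned long inactive, unsigned long active)
{
	unsigned long gb = (inactive + active) >> (30 - SKETCH_PAGE_SHIFT);

	return gb ? (unsigned long)sqrt(10.0 * gb) : 1;
}

int main(void)
{
	unsigned long inactive = 1UL << 18;	/* 1GiB of inactive anon pages */
	unsigned long active = 3UL << 18;	/* 3GiB of active anon pages */
	unsigned long ratio = inactive_ratio(inactive, active);

	/* inactive_anon_is_low: low when inactive * ratio < active */
	printf("ratio=%lu low=%d\n", ratio, inactive * ratio < active);
	return 0;
}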
1056 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1056 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
1057 { 1057 {
1058 unsigned long active; 1058 unsigned long active;
1059 unsigned long inactive; 1059 unsigned long inactive;
1060 1060
1061 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); 1061 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE);
1062 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); 1062 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE);
1063 1063
1064 return (active > inactive); 1064 return (active > inactive);
1065 } 1065 }
1066 1066
1067 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1067 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
1068 struct zone *zone, 1068 struct zone *zone,
1069 enum lru_list lru) 1069 enum lru_list lru)
1070 { 1070 {
1071 int nid = zone_to_nid(zone); 1071 int nid = zone_to_nid(zone);
1072 int zid = zone_idx(zone); 1072 int zid = zone_idx(zone);
1073 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1073 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1074 1074
1075 return MEM_CGROUP_ZSTAT(mz, lru); 1075 return MEM_CGROUP_ZSTAT(mz, lru);
1076 } 1076 }
1077 1077
1078 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1078 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1079 struct zone *zone) 1079 struct zone *zone)
1080 { 1080 {
1081 int nid = zone_to_nid(zone); 1081 int nid = zone_to_nid(zone);
1082 int zid = zone_idx(zone); 1082 int zid = zone_idx(zone);
1083 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 1083 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1084 1084
1085 return &mz->reclaim_stat; 1085 return &mz->reclaim_stat;
1086 } 1086 }
1087 1087
1088 struct zone_reclaim_stat * 1088 struct zone_reclaim_stat *
1089 mem_cgroup_get_reclaim_stat_from_page(struct page *page) 1089 mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1090 { 1090 {
1091 struct page_cgroup *pc; 1091 struct page_cgroup *pc;
1092 struct mem_cgroup_per_zone *mz; 1092 struct mem_cgroup_per_zone *mz;
1093 1093
1094 if (mem_cgroup_disabled()) 1094 if (mem_cgroup_disabled())
1095 return NULL; 1095 return NULL;
1096 1096
1097 pc = lookup_page_cgroup(page); 1097 pc = lookup_page_cgroup(page);
1098 if (!PageCgroupUsed(pc)) 1098 if (!PageCgroupUsed(pc))
1099 return NULL; 1099 return NULL;
1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1100 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1101 smp_rmb(); 1101 smp_rmb();
1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page); 1102 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1103 return &mz->reclaim_stat; 1103 return &mz->reclaim_stat;
1104 } 1104 }
1105 1105
1106 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1106 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1107 struct list_head *dst, 1107 struct list_head *dst,
1108 unsigned long *scanned, int order, 1108 unsigned long *scanned, int order,
1109 int mode, struct zone *z, 1109 int mode, struct zone *z,
1110 struct mem_cgroup *mem_cont, 1110 struct mem_cgroup *mem_cont,
1111 int active, int file) 1111 int active, int file)
1112 { 1112 {
1113 unsigned long nr_taken = 0; 1113 unsigned long nr_taken = 0;
1114 struct page *page; 1114 struct page *page;
1115 unsigned long scan; 1115 unsigned long scan;
1116 LIST_HEAD(pc_list); 1116 LIST_HEAD(pc_list);
1117 struct list_head *src; 1117 struct list_head *src;
1118 struct page_cgroup *pc, *tmp; 1118 struct page_cgroup *pc, *tmp;
1119 int nid = zone_to_nid(z); 1119 int nid = zone_to_nid(z);
1120 int zid = zone_idx(z); 1120 int zid = zone_idx(z);
1121 struct mem_cgroup_per_zone *mz; 1121 struct mem_cgroup_per_zone *mz;
1122 int lru = LRU_FILE * file + active; 1122 int lru = LRU_FILE * file + active;
1123 int ret; 1123 int ret;
1124 1124
1125 BUG_ON(!mem_cont); 1125 BUG_ON(!mem_cont);
1126 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 1126 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1127 src = &mz->lists[lru]; 1127 src = &mz->lists[lru];
1128 1128
1129 scan = 0; 1129 scan = 0;
1130 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 1130 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
1131 if (scan >= nr_to_scan) 1131 if (scan >= nr_to_scan)
1132 break; 1132 break;
1133 1133
1134 if (unlikely(!PageCgroupUsed(pc))) 1134 if (unlikely(!PageCgroupUsed(pc)))
1135 continue; 1135 continue;
1136 1136
1137 page = lookup_cgroup_page(pc); 1137 page = lookup_cgroup_page(pc);
1138 1138
1139 if (unlikely(!PageLRU(page))) 1139 if (unlikely(!PageLRU(page)))
1140 continue; 1140 continue;
1141 1141
1142 scan++; 1142 scan++;
1143 ret = __isolate_lru_page(page, mode, file); 1143 ret = __isolate_lru_page(page, mode, file);
1144 switch (ret) { 1144 switch (ret) {
1145 case 0: 1145 case 0:
1146 list_move(&page->lru, dst); 1146 list_move(&page->lru, dst);
1147 mem_cgroup_del_lru(page); 1147 mem_cgroup_del_lru(page);
1148 nr_taken += hpage_nr_pages(page); 1148 nr_taken += hpage_nr_pages(page);
1149 break; 1149 break;
1150 case -EBUSY: 1150 case -EBUSY:
1151 /* we don't affect global LRU but rotate in our LRU */ 1151 /* we don't affect global LRU but rotate in our LRU */
1152 mem_cgroup_rotate_lru_list(page, page_lru(page)); 1152 mem_cgroup_rotate_lru_list(page, page_lru(page));
1153 break; 1153 break;
1154 default: 1154 default:
1155 break; 1155 break;
1156 } 1156 }
1157 } 1157 }
1158 1158
1159 *scanned = scan; 1159 *scanned = scan;
1160 1160
1161 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, 1161 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1162 0, 0, 0, mode); 1162 0, 0, 0, mode);
1163 1163
1164 return nr_taken; 1164 return nr_taken;
1165 } 1165 }
1166 1166
1167 #define mem_cgroup_from_res_counter(counter, member) \ 1167 #define mem_cgroup_from_res_counter(counter, member) \
1168 container_of(counter, struct mem_cgroup, member) 1168 container_of(counter, struct mem_cgroup, member)
1169 1169
1170 /** 1170 /**
1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1171 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1172 * @mem: the memory cgroup 1172 * @mem: the memory cgroup
1173 * 1173 *
1174 * Returns the maximum amount of memory @mem can be charged with, in 1174 * Returns the maximum amount of memory @mem can be charged with, in
1175 * pages. 1175 * pages.
1176 */ 1176 */
1177 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1177 static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1178 { 1178 {
1179 unsigned long long margin; 1179 unsigned long long margin;
1180 1180
1181 margin = res_counter_margin(&mem->res); 1181 margin = res_counter_margin(&mem->res);
1182 if (do_swap_account) 1182 if (do_swap_account)
1183 margin = min(margin, res_counter_margin(&mem->memsw)); 1183 margin = min(margin, res_counter_margin(&mem->memsw));
1184 return margin >> PAGE_SHIFT; 1184 return margin >> PAGE_SHIFT;
1185 } 1185 }
1186 1186
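mem_cgroup_margin() reports the smaller of the two res_counter headrooms, converted from bytes to pages. A minimal user-space sketch, with a hypothetical sketch_counter in place of struct res_counter:

/* Hypothetical stand-in for a res_counter: limit and usage in bytes. */
struct sketch_counter {
	unsigned long long limit;
	unsigned long long usage;
};

#define SKETCH_PAGE_SHIFT 12	/* assume 4KiB pages */

static unsigned long long counter_margin(const struct sketch_counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

/* chargeable space in pages: bounded by "res" and, if swap is accounted, "memsw" */
static unsigned long sketch_margin(const struct sketch_counter *res,
				   const struct sketch_counter *memsw,
				   int do_swap_account)
{
	unsigned long long margin = counter_margin(res);

	if (do_swap_account && counter_margin(memsw) < margin)
		margin = counter_margin(memsw);
	return (unsigned long)(margin >> SKETCH_PAGE_SHIFT);
}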
1187 static unsigned int get_swappiness(struct mem_cgroup *memcg) 1187 static unsigned int get_swappiness(struct mem_cgroup *memcg)
1188 { 1188 {
1189 struct cgroup *cgrp = memcg->css.cgroup; 1189 struct cgroup *cgrp = memcg->css.cgroup;
1190 1190
1191 /* root ? */ 1191 /* root ? */
1192 if (cgrp->parent == NULL) 1192 if (cgrp->parent == NULL)
1193 return vm_swappiness; 1193 return vm_swappiness;
1194 1194
1195 return memcg->swappiness; 1195 return memcg->swappiness;
1196 } 1196 }
1197 1197
1198 static void mem_cgroup_start_move(struct mem_cgroup *mem) 1198 static void mem_cgroup_start_move(struct mem_cgroup *mem)
1199 { 1199 {
1200 int cpu; 1200 int cpu;
1201 1201
1202 get_online_cpus(); 1202 get_online_cpus();
1203 spin_lock(&mem->pcp_counter_lock); 1203 spin_lock(&mem->pcp_counter_lock);
1204 for_each_online_cpu(cpu) 1204 for_each_online_cpu(cpu)
1205 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1205 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1206 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1206 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1207 spin_unlock(&mem->pcp_counter_lock); 1207 spin_unlock(&mem->pcp_counter_lock);
1208 put_online_cpus(); 1208 put_online_cpus();
1209 1209
1210 synchronize_rcu(); 1210 synchronize_rcu();
1211 } 1211 }
1212 1212
1213 static void mem_cgroup_end_move(struct mem_cgroup *mem) 1213 static void mem_cgroup_end_move(struct mem_cgroup *mem)
1214 { 1214 {
1215 int cpu; 1215 int cpu;
1216 1216
1217 if (!mem) 1217 if (!mem)
1218 return; 1218 return;
1219 get_online_cpus(); 1219 get_online_cpus();
1220 spin_lock(&mem->pcp_counter_lock); 1220 spin_lock(&mem->pcp_counter_lock);
1221 for_each_online_cpu(cpu) 1221 for_each_online_cpu(cpu)
1222 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1222 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1223 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1223 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1224 spin_unlock(&mem->pcp_counter_lock); 1224 spin_unlock(&mem->pcp_counter_lock);
1225 put_online_cpus(); 1225 put_online_cpus();
1226 } 1226 }
1227 /* 1227 /*
1228 * Two routines for checking whether "mem" is under move_account() or not. 1228 * Two routines for checking whether "mem" is under move_account() or not.
1229 * 1229 *
1230 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used 1230 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1231 * for avoiding race in accounting. If true, 1231 * for avoiding race in accounting. If true,
1232 * pc->mem_cgroup may be overwritten. 1232 * pc->mem_cgroup may be overwritten.
1233 * 1233 *
1234 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1234 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1235 * under hierarchy of moving cgroups. This is for 1235 * under hierarchy of moving cgroups. This is for
1236 * waiting at high memory pressure caused by "move". 1236 * waiting at high memory pressure caused by "move".
1237 */ 1237 */
1238 1238
1239 static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1239 static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1240 { 1240 {
1241 VM_BUG_ON(!rcu_read_lock_held()); 1241 VM_BUG_ON(!rcu_read_lock_held());
1242 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1242 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1243 } 1243 }
1244 1244
1245 static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1245 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1246 { 1246 {
1247 struct mem_cgroup *from; 1247 struct mem_cgroup *from;
1248 struct mem_cgroup *to; 1248 struct mem_cgroup *to;
1249 bool ret = false; 1249 bool ret = false;
1250 /* 1250 /*
1251 * Unlike task_move routines, we access mc.to, mc.from not under 1251 * Unlike task_move routines, we access mc.to, mc.from not under
1252 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1252 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1253 */ 1253 */
1254 spin_lock(&mc.lock); 1254 spin_lock(&mc.lock);
1255 from = mc.from; 1255 from = mc.from;
1256 to = mc.to; 1256 to = mc.to;
1257 if (!from) 1257 if (!from)
1258 goto unlock; 1258 goto unlock;
1259 if (from == mem || to == mem 1259 if (from == mem || to == mem
1260 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) 1260 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1261 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) 1261 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1262 ret = true; 1262 ret = true;
1263 unlock: 1263 unlock:
1264 spin_unlock(&mc.lock); 1264 spin_unlock(&mc.lock);
1265 return ret; 1265 return ret;
1266 } 1266 }
1267 1267
1268 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1268 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1269 { 1269 {
1270 if (mc.moving_task && current != mc.moving_task) { 1270 if (mc.moving_task && current != mc.moving_task) {
1271 if (mem_cgroup_under_move(mem)) { 1271 if (mem_cgroup_under_move(mem)) {
1272 DEFINE_WAIT(wait); 1272 DEFINE_WAIT(wait);
1273 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1273 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1274 /* moving charge context might have finished. */ 1274 /* moving charge context might have finished. */
1275 if (mc.moving_task) 1275 if (mc.moving_task)
1276 schedule(); 1276 schedule();
1277 finish_wait(&mc.waitq, &wait); 1277 finish_wait(&mc.waitq, &wait);
1278 return true; 1278 return true;
1279 } 1279 }
1280 } 1280 }
1281 return false; 1281 return false;
1282 } 1282 }
1283 1283
1284 /** 1284 /**
1285 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1285 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1286 * @memcg: The memory cgroup that went over limit 1286 * @memcg: The memory cgroup that went over limit
1287 * @p: Task that is going to be killed 1287 * @p: Task that is going to be killed
1288 * 1288 *
1289 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1289 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1290 * enabled 1290 * enabled
1291 */ 1291 */
1292 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1292 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1293 { 1293 {
1294 struct cgroup *task_cgrp; 1294 struct cgroup *task_cgrp;
1295 struct cgroup *mem_cgrp; 1295 struct cgroup *mem_cgrp;
1296 /* 1296 /*
1297 * Need a buffer in BSS, can't rely on allocations. The code relies 1297 * Need a buffer in BSS, can't rely on allocations. The code relies
1298 * on the assumption that OOM is serialized for memory controller. 1298 * on the assumption that OOM is serialized for memory controller.
1299 * If this assumption is broken, revisit this code. 1299 * If this assumption is broken, revisit this code.
1300 */ 1300 */
1301 static char memcg_name[PATH_MAX]; 1301 static char memcg_name[PATH_MAX];
1302 int ret; 1302 int ret;
1303 1303
1304 if (!memcg || !p) 1304 if (!memcg || !p)
1305 return; 1305 return;
1306 1306
1307 1307
1308 rcu_read_lock(); 1308 rcu_read_lock();
1309 1309
1310 mem_cgrp = memcg->css.cgroup; 1310 mem_cgrp = memcg->css.cgroup;
1311 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1311 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
1312 1312
1313 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1313 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
1314 if (ret < 0) { 1314 if (ret < 0) {
1315 /* 1315 /*
1316 * Unfortunately, we are unable to convert to a useful name, 1316 * Unfortunately, we are unable to convert to a useful name,
1317 * but we'll still print out the usage information. 1317 * but we'll still print out the usage information.
1318 */ 1318 */
1319 rcu_read_unlock(); 1319 rcu_read_unlock();
1320 goto done; 1320 goto done;
1321 } 1321 }
1322 rcu_read_unlock(); 1322 rcu_read_unlock();
1323 1323
1324 printk(KERN_INFO "Task in %s killed", memcg_name); 1324 printk(KERN_INFO "Task in %s killed", memcg_name);
1325 1325
1326 rcu_read_lock(); 1326 rcu_read_lock();
1327 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1327 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
1328 if (ret < 0) { 1328 if (ret < 0) {
1329 rcu_read_unlock(); 1329 rcu_read_unlock();
1330 goto done; 1330 goto done;
1331 } 1331 }
1332 rcu_read_unlock(); 1332 rcu_read_unlock();
1333 1333
1334 /* 1334 /*
1335 * Continues from above, so we don't need a KERN_ level 1335 * Continues from above, so we don't need a KERN_ level
1336 */ 1336 */
1337 printk(KERN_CONT " as a result of limit of %s\n", memcg_name); 1337 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
1338 done: 1338 done:
1339 1339
1340 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", 1340 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
1341 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1341 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
1342 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1342 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
1343 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1343 res_counter_read_u64(&memcg->res, RES_FAILCNT));
1344 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " 1344 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
1345 "failcnt %llu\n", 1345 "failcnt %llu\n",
1346 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1346 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
1347 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1347 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
1348 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1348 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
1349 } 1349 }
1350 1350
1351 /* 1351 /*
1352 * This function returns the number of memcgs under the hierarchy tree. Returns 1352 * This function returns the number of memcgs under the hierarchy tree. Returns
1353 * 1 (self count) if there are no children. 1353 * 1 (self count) if there are no children.
1354 */ 1354 */
1355 static int mem_cgroup_count_children(struct mem_cgroup *mem) 1355 static int mem_cgroup_count_children(struct mem_cgroup *mem)
1356 { 1356 {
1357 int num = 0; 1357 int num = 0;
1358 struct mem_cgroup *iter; 1358 struct mem_cgroup *iter;
1359 1359
1360 for_each_mem_cgroup_tree(iter, mem) 1360 for_each_mem_cgroup_tree(iter, mem)
1361 num++; 1361 num++;
1362 return num; 1362 return num;
1363 } 1363 }
1364 1364
1365 /* 1365 /*
1366 * Return the memory (and swap, if configured) limit for a memcg. 1366 * Return the memory (and swap, if configured) limit for a memcg.
1367 */ 1367 */
1368 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1368 u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1369 { 1369 {
1370 u64 limit; 1370 u64 limit;
1371 u64 memsw; 1371 u64 memsw;
1372 1372
1373 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1373 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1374 limit += total_swap_pages << PAGE_SHIFT; 1374 limit += total_swap_pages << PAGE_SHIFT;
1375 1375
1376 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1376 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1377 /* 1377 /*
1378 * If memsw is finite and limits the amount of swap space available 1378 * If memsw is finite and limits the amount of swap space available
1379 * to this memcg, return that limit. 1379 * to this memcg, return that limit.
1380 */ 1380 */
1381 return min(limit, memsw); 1381 return min(limit, memsw);
1382 } 1382 }
1383 1383
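The effective limit above is "memory limit plus all of swap", capped by the memsw limit when that is smaller. A tiny sketch of the same min(), with all three inputs supplied as hypothetical byte values:

/* Effective limit: memory limit + all swap, but never more than the memsw limit. */
static unsigned long long sketch_get_limit(unsigned long long mem_limit,
					   unsigned long long memsw_limit,
					   unsigned long long total_swap_bytes)
{
	unsigned long long limit = mem_limit + total_swap_bytes;

	return limit < memsw_limit ? limit : memsw_limit;
}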
1384 /* 1384 /*
1385 * Visit the first child (need not be the first child as per the ordering 1385 * Visit the first child (need not be the first child as per the ordering
1386 * of the cgroup list, since we track last_scanned_child) of @mem and use 1386 * of the cgroup list, since we track last_scanned_child) of @mem and use
1387 * that to reclaim free pages from. 1387 * that to reclaim free pages from.
1388 */ 1388 */
1389 static struct mem_cgroup * 1389 static struct mem_cgroup *
1390 mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1390 mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1391 { 1391 {
1392 struct mem_cgroup *ret = NULL; 1392 struct mem_cgroup *ret = NULL;
1393 struct cgroup_subsys_state *css; 1393 struct cgroup_subsys_state *css;
1394 int nextid, found; 1394 int nextid, found;
1395 1395
1396 if (!root_mem->use_hierarchy) { 1396 if (!root_mem->use_hierarchy) {
1397 css_get(&root_mem->css); 1397 css_get(&root_mem->css);
1398 ret = root_mem; 1398 ret = root_mem;
1399 } 1399 }
1400 1400
1401 while (!ret) { 1401 while (!ret) {
1402 rcu_read_lock(); 1402 rcu_read_lock();
1403 nextid = root_mem->last_scanned_child + 1; 1403 nextid = root_mem->last_scanned_child + 1;
1404 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1404 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
1405 &found); 1405 &found);
1406 if (css && css_tryget(css)) 1406 if (css && css_tryget(css))
1407 ret = container_of(css, struct mem_cgroup, css); 1407 ret = container_of(css, struct mem_cgroup, css);
1408 1408
1409 rcu_read_unlock(); 1409 rcu_read_unlock();
1410 /* Updates scanning parameter */ 1410 /* Updates scanning parameter */
1411 if (!css) { 1411 if (!css) {
1412 /* this means start scan from ID:1 */ 1412 /* this means start scan from ID:1 */
1413 root_mem->last_scanned_child = 0; 1413 root_mem->last_scanned_child = 0;
1414 } else 1414 } else
1415 root_mem->last_scanned_child = found; 1415 root_mem->last_scanned_child = found;
1416 } 1416 }
1417 1417
1418 return ret; 1418 return ret;
1419 } 1419 }
1420 1420
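The victim selection above is a cursor-based round robin over the hierarchy: continue from last_scanned_child and wrap to the start when the walk runs off the end. A simplified sketch over a flat array of children (the array index stands in for the css id walk; none of these names exist in memcontrol.c):

/* Round-robin pick over a hypothetical flat array of child group ids. */
struct sketch_root {
	int nr_children;
	int last_scanned;	/* index of the child we visited last time */
};

static int select_victim(struct sketch_root *root)
{
	int next = root->last_scanned + 1;

	if (next >= root->nr_children)
		next = 0;	/* ran off the end: restart the scan */
	root->last_scanned = next;
	return next;
}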
1421 /* 1421 /*
1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1422 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1423 * we reclaimed from, so that we don't end up penalizing one child extensively 1423 * we reclaimed from, so that we don't end up penalizing one child extensively
1424 * based on its position in the children list. 1424 * based on its position in the children list.
1425 * 1425 *
1426 * root_mem is the original ancestor that we've been reclaiming from. 1426 * root_mem is the original ancestor that we've been reclaiming from.
1427 * 1427 *
1428 * We give up and return to the caller when we visit root_mem twice. 1428 * We give up and return to the caller when we visit root_mem twice.
1429 * (other groups can be removed while we're walking....) 1429 * (other groups can be removed while we're walking....)
1430 * 1430 *
1431 * If shrink==true, this returns immediately to avoid freeing too much. 1431 * If shrink==true, this returns immediately to avoid freeing too much.
1432 */ 1432 */
1433 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1433 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1434 struct zone *zone, 1434 struct zone *zone,
1435 gfp_t gfp_mask, 1435 gfp_t gfp_mask,
1436 unsigned long reclaim_options, 1436 unsigned long reclaim_options,
1437 unsigned long *total_scanned) 1437 unsigned long *total_scanned)
1438 { 1438 {
1439 struct mem_cgroup *victim; 1439 struct mem_cgroup *victim;
1440 int ret, total = 0; 1440 int ret, total = 0;
1441 int loop = 0; 1441 int loop = 0;
1442 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1442 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1443 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1443 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1444 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1444 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1445 unsigned long excess; 1445 unsigned long excess;
1446 unsigned long nr_scanned; 1446 unsigned long nr_scanned;
1447 1447
1448 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1448 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1449 1449
1450 /* If memsw_is_minimum==1, swap-out is of no use. */ 1450 /* If memsw_is_minimum==1, swap-out is of no use. */
1451 if (root_mem->memsw_is_minimum) 1451 if (root_mem->memsw_is_minimum)
1452 noswap = true; 1452 noswap = true;
1453 1453
1454 while (1) { 1454 while (1) {
1455 victim = mem_cgroup_select_victim(root_mem); 1455 victim = mem_cgroup_select_victim(root_mem);
1456 if (victim == root_mem) { 1456 if (victim == root_mem) {
1457 loop++; 1457 loop++;
1458 if (loop >= 1) 1458 if (loop >= 1)
1459 drain_all_stock_async(); 1459 drain_all_stock_async();
1460 if (loop >= 2) { 1460 if (loop >= 2) {
1461 /* 1461 /*
1462 * If we have not been able to reclaim 1462 * If we have not been able to reclaim
1463 * anything, it might be because there are 1463 * anything, it might be because there are
1464 * no reclaimable pages under this hierarchy. 1464 * no reclaimable pages under this hierarchy.
1465 */ 1465 */
1466 if (!check_soft || !total) { 1466 if (!check_soft || !total) {
1467 css_put(&victim->css); 1467 css_put(&victim->css);
1468 break; 1468 break;
1469 } 1469 }
1470 /* 1470 /*
1471 * We want to do more targeted reclaim. 1471 * We want to do more targeted reclaim.
1472 * excess >> 2 is not so excessive that we 1472 * excess >> 2 is not so excessive that we
1473 * reclaim too much, nor so small that we keep 1473 * reclaim too much, nor so small that we keep
1474 * coming back to reclaim from this cgroup. 1474 * coming back to reclaim from this cgroup.
1475 */ 1475 */
1476 if (total >= (excess >> 2) || 1476 if (total >= (excess >> 2) ||
1477 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { 1477 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1478 css_put(&victim->css); 1478 css_put(&victim->css);
1479 break; 1479 break;
1480 } 1480 }
1481 } 1481 }
1482 } 1482 }
1483 if (!mem_cgroup_local_usage(victim)) { 1483 if (!mem_cgroup_local_usage(victim)) {
1484 /* this cgroup's local usage == 0 */ 1484 /* this cgroup's local usage == 0 */
1485 css_put(&victim->css); 1485 css_put(&victim->css);
1486 continue; 1486 continue;
1487 } 1487 }
1488 /* we use swappiness of local cgroup */ 1488 /* we use swappiness of local cgroup */
1489 if (check_soft) { 1489 if (check_soft) {
1490 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1490 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1491 noswap, get_swappiness(victim), zone, 1491 noswap, get_swappiness(victim), zone,
1492 &nr_scanned); 1492 &nr_scanned);
1493 *total_scanned += nr_scanned; 1493 *total_scanned += nr_scanned;
1494 } else 1494 } else
1495 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1495 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1496 noswap, get_swappiness(victim)); 1496 noswap, get_swappiness(victim));
1497 css_put(&victim->css); 1497 css_put(&victim->css);
1498 /* 1498 /*
1499 * When shrinking usage, we can't check whether we should stop here or 1499 * When shrinking usage, we can't check whether we should stop here or
1500 * reclaim more. It depends on the callers. last_scanned_child 1500 * reclaim more. It depends on the callers. last_scanned_child
1501 * will be enough to keep fairness under the tree. 1501 * will be enough to keep fairness under the tree.
1502 */ 1502 */
1503 if (shrink) 1503 if (shrink)
1504 return ret; 1504 return ret;
1505 total += ret; 1505 total += ret;
1506 if (check_soft) { 1506 if (check_soft) {
1507 if (!res_counter_soft_limit_excess(&root_mem->res)) 1507 if (!res_counter_soft_limit_excess(&root_mem->res))
1508 return total; 1508 return total;
1509 } else if (mem_cgroup_margin(root_mem)) 1509 } else if (mem_cgroup_margin(root_mem))
1510 return 1 + total; 1510 return 1 + total;
1511 } 1511 }
1512 return total; 1512 return total;
1513 } 1513 }
1514 1514
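For the soft-limit case, the loop above keeps reclaiming until roughly a quarter of the excess is gone or a maximum number of passes is reached. The termination heuristic in isolation, with a hypothetical reclaim_once() stub standing in for the per-victim shrink call:

#define SKETCH_MAX_RECLAIM_LOOPS 100	/* stands in for MEM_CGROUP_MAX_RECLAIM_LOOPS */

/* Hypothetical per-victim reclaim step: pretend each pass frees 32 pages. */
static unsigned long reclaim_once(void)
{
	return 32;
}

static unsigned long sketch_soft_limit_reclaim(unsigned long excess)
{
	unsigned long total = 0;
	int loop;

	for (loop = 0; loop <= SKETCH_MAX_RECLAIM_LOOPS; loop++) {
		total += reclaim_once();
		/* enough progress: roughly a quarter of the excess reclaimed */
		if (total >= (excess >> 2))
			break;
	}
	return total;
}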
1515 /* 1515 /*
1516 * Check whether the OOM-Killer is already running under our hierarchy. 1516 * Check whether the OOM-Killer is already running under our hierarchy.
1517 * If someone is running it, return false. 1517 * If someone is running it, return false.
1518 */ 1518 */
1519 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1519 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1520 { 1520 {
1521 int x, lock_count = 0; 1521 int x, lock_count = 0;
1522 struct mem_cgroup *iter; 1522 struct mem_cgroup *iter;
1523 1523
1524 for_each_mem_cgroup_tree(iter, mem) { 1524 for_each_mem_cgroup_tree(iter, mem) {
1525 x = atomic_inc_return(&iter->oom_lock); 1525 x = atomic_inc_return(&iter->oom_lock);
1526 lock_count = max(x, lock_count); 1526 lock_count = max(x, lock_count);
1527 } 1527 }
1528 1528
1529 if (lock_count == 1) 1529 if (lock_count == 1)
1530 return true; 1530 return true;
1531 return false; 1531 return false;
1532 } 1532 }
1533 1533
1534 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1534 static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1535 { 1535 {
1536 struct mem_cgroup *iter; 1536 struct mem_cgroup *iter;
1537 1537
1538 /* 1538 /*
1539 * When a new child is created while the hierarchy is under oom, 1539 * When a new child is created while the hierarchy is under oom,
1540 * mem_cgroup_oom_lock() may not be called. We have to use 1540 * mem_cgroup_oom_lock() may not be called. We have to use
1541 * atomic_add_unless() here. 1541 * atomic_add_unless() here.
1542 */ 1542 */
1543 for_each_mem_cgroup_tree(iter, mem) 1543 for_each_mem_cgroup_tree(iter, mem)
1544 atomic_add_unless(&iter->oom_lock, -1, 0); 1544 atomic_add_unless(&iter->oom_lock, -1, 0);
1545 return 0; 1545 return 0;
1546 } 1546 }
1547 1547
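The OOM lock above is a counter on every group in the subtree; the hierarchy counts as locked only when the largest post-increment value is exactly 1, i.e. nobody else had raised any of the counters. A sketch with plain ints instead of atomics (it shows the bookkeeping, not the concurrency):

/* "Lock" a whole subtree: succeed only if no counter was already raised. */
static int sketch_oom_lock(int *oom_lock, int nr_groups)
{
	int i, max_count = 0;

	for (i = 0; i < nr_groups; i++) {
		oom_lock[i]++;
		if (oom_lock[i] > max_count)
			max_count = oom_lock[i];
	}
	return max_count == 1;	/* true only for the first locker */
}

static void sketch_oom_unlock(int *oom_lock, int nr_groups)
{
	int i;

	for (i = 0; i < nr_groups; i++)
		if (oom_lock[i] > 0)	/* mirrors atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}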
1548 1548
1549 static DEFINE_MUTEX(memcg_oom_mutex); 1549 static DEFINE_MUTEX(memcg_oom_mutex);
1550 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1550 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1551 1551
1552 struct oom_wait_info { 1552 struct oom_wait_info {
1553 struct mem_cgroup *mem; 1553 struct mem_cgroup *mem;
1554 wait_queue_t wait; 1554 wait_queue_t wait;
1555 }; 1555 };
1556 1556
1557 static int memcg_oom_wake_function(wait_queue_t *wait, 1557 static int memcg_oom_wake_function(wait_queue_t *wait,
1558 unsigned mode, int sync, void *arg) 1558 unsigned mode, int sync, void *arg)
1559 { 1559 {
1560 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; 1560 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg;
1561 struct oom_wait_info *oom_wait_info; 1561 struct oom_wait_info *oom_wait_info;
1562 1562
1563 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1563 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1564 1564
1565 if (oom_wait_info->mem == wake_mem) 1565 if (oom_wait_info->mem == wake_mem)
1566 goto wakeup; 1566 goto wakeup;
1567 /* if no hierarchy, no match */ 1567 /* if no hierarchy, no match */
1568 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) 1568 if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy)
1569 return 0; 1569 return 0;
1570 /* 1570 /*
1571 * Both of oom_wait_info->mem and wake_mem are stable under us. 1571 * Both of oom_wait_info->mem and wake_mem are stable under us.
1572 * Then we can use css_is_ancestor without taking care of RCU. 1572 * Then we can use css_is_ancestor without taking care of RCU.
1573 */ 1573 */
1574 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && 1574 if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) &&
1575 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) 1575 !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css))
1576 return 0; 1576 return 0;
1577 1577
1578 wakeup: 1578 wakeup:
1579 return autoremove_wake_function(wait, mode, sync, arg); 1579 return autoremove_wake_function(wait, mode, sync, arg);
1580 } 1580 }
1581 1581
1582 static void memcg_wakeup_oom(struct mem_cgroup *mem) 1582 static void memcg_wakeup_oom(struct mem_cgroup *mem)
1583 { 1583 {
1584 /* for filtering, pass "mem" as argument. */ 1584 /* for filtering, pass "mem" as argument. */
1585 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1585 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem);
1586 } 1586 }
1587 1587
1588 static void memcg_oom_recover(struct mem_cgroup *mem) 1588 static void memcg_oom_recover(struct mem_cgroup *mem)
1589 { 1589 {
1590 if (mem && atomic_read(&mem->oom_lock)) 1590 if (mem && atomic_read(&mem->oom_lock))
1591 memcg_wakeup_oom(mem); 1591 memcg_wakeup_oom(mem);
1592 } 1592 }
1593 1593
1594 /* 1594 /*
1595 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1595 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1596 */ 1596 */
1597 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1597 bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1598 { 1598 {
1599 struct oom_wait_info owait; 1599 struct oom_wait_info owait;
1600 bool locked, need_to_kill; 1600 bool locked, need_to_kill;
1601 1601
1602 owait.mem = mem; 1602 owait.mem = mem;
1603 owait.wait.flags = 0; 1603 owait.wait.flags = 0;
1604 owait.wait.func = memcg_oom_wake_function; 1604 owait.wait.func = memcg_oom_wake_function;
1605 owait.wait.private = current; 1605 owait.wait.private = current;
1606 INIT_LIST_HEAD(&owait.wait.task_list); 1606 INIT_LIST_HEAD(&owait.wait.task_list);
1607 need_to_kill = true; 1607 need_to_kill = true;
1608 /* At first, try to OOM lock hierarchy under mem.*/ 1608 /* At first, try to OOM lock hierarchy under mem.*/
1609 mutex_lock(&memcg_oom_mutex); 1609 mutex_lock(&memcg_oom_mutex);
1610 locked = mem_cgroup_oom_lock(mem); 1610 locked = mem_cgroup_oom_lock(mem);
1611 /* 1611 /*
1612 * Even if signal_pending(), we can't quit charge() loop without 1612 * Even if signal_pending(), we can't quit charge() loop without
1613 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1613 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1614 * under OOM is always welcomed, use TASK_KILLABLE here. 1614 * under OOM is always welcomed, use TASK_KILLABLE here.
1615 */ 1615 */
1616 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1616 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1617 if (!locked || mem->oom_kill_disable) 1617 if (!locked || mem->oom_kill_disable)
1618 need_to_kill = false; 1618 need_to_kill = false;
1619 if (locked) 1619 if (locked)
1620 mem_cgroup_oom_notify(mem); 1620 mem_cgroup_oom_notify(mem);
1621 mutex_unlock(&memcg_oom_mutex); 1621 mutex_unlock(&memcg_oom_mutex);
1622 1622
1623 if (need_to_kill) { 1623 if (need_to_kill) {
1624 finish_wait(&memcg_oom_waitq, &owait.wait); 1624 finish_wait(&memcg_oom_waitq, &owait.wait);
1625 mem_cgroup_out_of_memory(mem, mask); 1625 mem_cgroup_out_of_memory(mem, mask);
1626 } else { 1626 } else {
1627 schedule(); 1627 schedule();
1628 finish_wait(&memcg_oom_waitq, &owait.wait); 1628 finish_wait(&memcg_oom_waitq, &owait.wait);
1629 } 1629 }
1630 mutex_lock(&memcg_oom_mutex); 1630 mutex_lock(&memcg_oom_mutex);
1631 mem_cgroup_oom_unlock(mem); 1631 mem_cgroup_oom_unlock(mem);
1632 memcg_wakeup_oom(mem); 1632 memcg_wakeup_oom(mem);
1633 mutex_unlock(&memcg_oom_mutex); 1633 mutex_unlock(&memcg_oom_mutex);
1634 1634
1635 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1635 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
1636 return false; 1636 return false;
1637 /* Give chance to dying process */ 1637 /* Give chance to dying process */
1638 schedule_timeout(1); 1638 schedule_timeout(1);
1639 return true; 1639 return true;
1640 } 1640 }
1641 1641
1642 /* 1642 /*
1643 * Currently used to update mapped file statistics, but the routine can be 1643 * Currently used to update mapped file statistics, but the routine can be
1644 * generalized to update other statistics as well. 1644 * generalized to update other statistics as well.
1645 * 1645 *
1646 * Notes: Race condition 1646 * Notes: Race condition
1647 * 1647 *
1648 * We usually use page_cgroup_lock() for accessing page_cgroup members, but 1648 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1649 * it tends to be costly. Under some conditions, we don't need 1649 * it tends to be costly. Under some conditions, we don't need
1650 * to do so _always_. 1650 * to do so _always_.
1651 * 1651 *
1652 * Considering "charge", lock_page_cgroup() is not required because all 1652 * Considering "charge", lock_page_cgroup() is not required because all
1653 * file-stat operations happen after a page is attached to radix-tree. There 1653 * file-stat operations happen after a page is attached to radix-tree. There
1654 * are no races with "charge". 1654 * are no races with "charge".
1655 * 1655 *
1656 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 1656 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1657 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 1657 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
1658 * if there are races with "uncharge". The statistics themselves are properly 1658 * if there are races with "uncharge". The statistics themselves are properly
1659 * handled by flags. 1659 * handled by flags.
1660 * 1660 *
1661 * Considering "move", this is the only case where we see a race. To keep the 1661 * Considering "move", this is the only case where we see a race. To keep the
1662 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect 1662 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1663 * whether there is a possibility of a race. If there is, we take a lock. 1663 * whether there is a possibility of a race. If there is, we take a lock.
1664 */ 1664 */
1665 1665
1666 void mem_cgroup_update_page_stat(struct page *page, 1666 void mem_cgroup_update_page_stat(struct page *page,
1667 enum mem_cgroup_page_stat_item idx, int val) 1667 enum mem_cgroup_page_stat_item idx, int val)
1668 { 1668 {
1669 struct mem_cgroup *mem; 1669 struct mem_cgroup *mem;
1670 struct page_cgroup *pc = lookup_page_cgroup(page); 1670 struct page_cgroup *pc = lookup_page_cgroup(page);
1671 bool need_unlock = false; 1671 bool need_unlock = false;
1672 unsigned long uninitialized_var(flags); 1672 unsigned long uninitialized_var(flags);
1673 1673
1674 if (unlikely(!pc)) 1674 if (unlikely(!pc))
1675 return; 1675 return;
1676 1676
1677 rcu_read_lock(); 1677 rcu_read_lock();
1678 mem = pc->mem_cgroup; 1678 mem = pc->mem_cgroup;
1679 if (unlikely(!mem || !PageCgroupUsed(pc))) 1679 if (unlikely(!mem || !PageCgroupUsed(pc)))
1680 goto out; 1680 goto out;
1681 /* pc->mem_cgroup is unstable ? */ 1681 /* pc->mem_cgroup is unstable ? */
1682 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1682 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1683 /* take a lock against to access pc->mem_cgroup */ 1683 /* take a lock against to access pc->mem_cgroup */
1684 move_lock_page_cgroup(pc, &flags); 1684 move_lock_page_cgroup(pc, &flags);
1685 need_unlock = true; 1685 need_unlock = true;
1686 mem = pc->mem_cgroup; 1686 mem = pc->mem_cgroup;
1687 if (!mem || !PageCgroupUsed(pc)) 1687 if (!mem || !PageCgroupUsed(pc))
1688 goto out; 1688 goto out;
1689 } 1689 }
1690 1690
1691 switch (idx) { 1691 switch (idx) {
1692 case MEMCG_NR_FILE_MAPPED: 1692 case MEMCG_NR_FILE_MAPPED:
1693 if (val > 0) 1693 if (val > 0)
1694 SetPageCgroupFileMapped(pc); 1694 SetPageCgroupFileMapped(pc);
1695 else if (!page_mapped(page)) 1695 else if (!page_mapped(page))
1696 ClearPageCgroupFileMapped(pc); 1696 ClearPageCgroupFileMapped(pc);
1697 idx = MEM_CGROUP_STAT_FILE_MAPPED; 1697 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1698 break; 1698 break;
1699 default: 1699 default:
1700 BUG(); 1700 BUG();
1701 } 1701 }
1702 1702
1703 this_cpu_add(mem->stat->count[idx], val); 1703 this_cpu_add(mem->stat->count[idx], val);
1704 1704
1705 out: 1705 out:
1706 if (unlikely(need_unlock)) 1706 if (unlikely(need_unlock))
1707 move_unlock_page_cgroup(pc, &flags); 1707 move_unlock_page_cgroup(pc, &flags);
1708 rcu_read_unlock(); 1708 rcu_read_unlock();
1709 return; 1709 return;
1710 } 1710 }
1711 EXPORT_SYMBOL(mem_cgroup_update_page_stat); 1711 EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1712 1712
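In short: update the counter locklessly unless a charge move might be in flight (or the page is huge), in which case take the move lock first. A rough sketch of that decision, with hypothetical stand-ins for the percpu MEM_CGROUP_ON_MOVE check and the page_cgroup move lock:

#include <stdbool.h>

/* Hypothetical stand-ins: a stat counter plus a "move in progress" flag. */
struct sketch_page_stat {
	long file_mapped;
	bool move_in_progress;	/* plays the role of MEM_CGROUP_ON_MOVE > 0 */
	bool huge_page;
};

static void lock_stat(struct sketch_page_stat *s)   { (void)s; /* take the move lock */ }
static void unlock_stat(struct sketch_page_stat *s) { (void)s; /* drop the move lock */ }

static void sketch_update_page_stat(struct sketch_page_stat *s, int val)
{
	bool need_unlock = false;

	/* slow path only when the owner might change under us */
	if (s->move_in_progress || s->huge_page) {
		lock_stat(s);
		need_unlock = true;
	}
	s->file_mapped += val;
	if (need_unlock)
		unlock_stat(s);
}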
1713 /* 1713 /*
1714 * size of first charge trial. "32" comes from vmscan.c's magic value. 1714 * size of first charge trial. "32" comes from vmscan.c's magic value.
1715 * TODO: it may be necessary to use bigger numbers on big iron. 1715 * TODO: it may be necessary to use bigger numbers on big iron.
1716 */ 1716 */
1717 #define CHARGE_BATCH 32U 1717 #define CHARGE_BATCH 32U
1718 struct memcg_stock_pcp { 1718 struct memcg_stock_pcp {
1719 struct mem_cgroup *cached; /* this never be root cgroup */ 1719 struct mem_cgroup *cached; /* this never be root cgroup */
1720 unsigned int nr_pages; 1720 unsigned int nr_pages;
1721 struct work_struct work; 1721 struct work_struct work;
1722 }; 1722 };
1723 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1723 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1724 static atomic_t memcg_drain_count; 1724 static atomic_t memcg_drain_count;
1725 1725
1726 /* 1726 /*
1727 * Try to consume stocked charge on this cpu. If successful, one page is consumed 1727 * Try to consume stocked charge on this cpu. If successful, one page is consumed
1728 * from the local stock and true is returned. If the stock is 0 or holds charges from a 1728 * from the local stock and true is returned. If the stock is 0 or holds charges from a
1729 * cgroup which is not the current target, returns false. This stock will be 1729 * cgroup which is not the current target, returns false. This stock will be
1730 * refilled. 1730 * refilled.
1731 */ 1731 */
1732 static bool consume_stock(struct mem_cgroup *mem) 1732 static bool consume_stock(struct mem_cgroup *mem)
1733 { 1733 {
1734 struct memcg_stock_pcp *stock; 1734 struct memcg_stock_pcp *stock;
1735 bool ret = true; 1735 bool ret = true;
1736 1736
1737 stock = &get_cpu_var(memcg_stock); 1737 stock = &get_cpu_var(memcg_stock);
1738 if (mem == stock->cached && stock->nr_pages) 1738 if (mem == stock->cached && stock->nr_pages)
1739 stock->nr_pages--; 1739 stock->nr_pages--;
1740 else /* need to call res_counter_charge */ 1740 else /* need to call res_counter_charge */
1741 ret = false; 1741 ret = false;
1742 put_cpu_var(memcg_stock); 1742 put_cpu_var(memcg_stock);
1743 return ret; 1743 return ret;
1744 } 1744 }
1745 1745
1746 /* 1746 /*
1747 * Return stock cached in the percpu area to the res_counter and reset the cached information. 1747 * Return stock cached in the percpu area to the res_counter and reset the cached information.
1748 */ 1748 */
1749 static void drain_stock(struct memcg_stock_pcp *stock) 1749 static void drain_stock(struct memcg_stock_pcp *stock)
1750 { 1750 {
1751 struct mem_cgroup *old = stock->cached; 1751 struct mem_cgroup *old = stock->cached;
1752 1752
1753 if (stock->nr_pages) { 1753 if (stock->nr_pages) {
1754 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 1754 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
1755 1755
1756 res_counter_uncharge(&old->res, bytes); 1756 res_counter_uncharge(&old->res, bytes);
1757 if (do_swap_account) 1757 if (do_swap_account)
1758 res_counter_uncharge(&old->memsw, bytes); 1758 res_counter_uncharge(&old->memsw, bytes);
1759 stock->nr_pages = 0; 1759 stock->nr_pages = 0;
1760 } 1760 }
1761 stock->cached = NULL; 1761 stock->cached = NULL;
1762 } 1762 }
1763 1763
1764 /* 1764 /*
1765 * This must be called with preemption disabled or must be called by 1765 * This must be called with preemption disabled or must be called by
1766 * a thread which is pinned to the local cpu. 1766 * a thread which is pinned to the local cpu.
1767 */ 1767 */
1768 static void drain_local_stock(struct work_struct *dummy) 1768 static void drain_local_stock(struct work_struct *dummy)
1769 { 1769 {
1770 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 1770 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1771 drain_stock(stock); 1771 drain_stock(stock);
1772 } 1772 }
1773 1773
1774 /* 1774 /*
1775 * Cache charges (nr_pages), taken from the res_counter, in the local per-cpu area. 1775 * Cache charges (nr_pages), taken from the res_counter, in the local per-cpu area.
1776 * They will be consumed by the consume_stock() function later. 1776 * They will be consumed by the consume_stock() function later.
1777 */ 1777 */
1778 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 1778 static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1779 { 1779 {
1780 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 1780 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1781 1781
1782 if (stock->cached != mem) { /* reset if necessary */ 1782 if (stock->cached != mem) { /* reset if necessary */
1783 drain_stock(stock); 1783 drain_stock(stock);
1784 stock->cached = mem; 1784 stock->cached = mem;
1785 } 1785 }
1786 stock->nr_pages += nr_pages; 1786 stock->nr_pages += nr_pages;
1787 put_cpu_var(memcg_stock); 1787 put_cpu_var(memcg_stock);
1788 } 1788 }
1789 1789
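The stock machinery caches pre-charged pages per cpu so the hot path can avoid the shared res_counter. A single-threaded user-space sketch of the consume/refill/drain cycle, with one global stock and a plain counter standing in for the res_counter (none of these names are from memcontrol.c):

#include <stdbool.h>

/* Hypothetical single "stock": cached owner and a count of pre-charged pages. */
struct sketch_stock {
	int cached_cgroup;	/* id of the cgroup the stock belongs to */
	unsigned int nr_pages;
};

static unsigned long global_usage;	/* stand-in for the shared res_counter */

static bool sketch_consume_stock(struct sketch_stock *stock, int cgroup)
{
	if (cgroup == stock->cached_cgroup && stock->nr_pages) {
		stock->nr_pages--;	/* fast path: no shared counter touched */
		return true;
	}
	return false;			/* caller must charge the shared counter */
}

static void sketch_drain_stock(struct sketch_stock *stock)
{
	global_usage -= stock->nr_pages;	/* give cached charges back */
	stock->nr_pages = 0;
	stock->cached_cgroup = -1;
}

static void sketch_refill_stock(struct sketch_stock *stock, int cgroup,
				unsigned int nr_pages)
{
	if (stock->cached_cgroup != cgroup)	/* reset if owned by someone else */
		sketch_drain_stock(stock);
	stock->cached_cgroup = cgroup;
	stock->nr_pages += nr_pages;	/* pages were already charged by the caller */
}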
1790 /* 1790 /*
1791 * Tries to drain stocked charges on other cpus. This function is asynchronous 1791 * Tries to drain stocked charges on other cpus. This function is asynchronous
1792 * and just puts a work item per cpu for draining locally on each cpu. Callers can 1792 * and just puts a work item per cpu for draining locally on each cpu. Callers can
1793 * expect some charges to be returned to the res_counter later but cannot wait for 1793 * expect some charges to be returned to the res_counter later but cannot wait for
1794 * it. 1794 * it.
1795 */ 1795 */
1796 static void drain_all_stock_async(void) 1796 static void drain_all_stock_async(void)
1797 { 1797 {
1798 int cpu; 1798 int cpu;
1799 /* This function is for scheduling "drain" in an asynchronous way. 1799 /* This function is for scheduling "drain" in an asynchronous way.
1800 * The result of "drain" is not directly handled by callers. Thus, 1800 * The result of "drain" is not directly handled by callers. Thus,
1801 * if someone is already calling drain, we don't have to call drain again. 1801 * if someone is already calling drain, we don't have to call drain again.
1802 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if 1802 * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if
1803 * there is a race. We just do a loose check here. 1803 * there is a race. We just do a loose check here.
1804 */ 1804 */
1805 if (atomic_read(&memcg_drain_count)) 1805 if (atomic_read(&memcg_drain_count))
1806 return; 1806 return;
1807 /* Notify other cpus that system-wide "drain" is running */ 1807 /* Notify other cpus that system-wide "drain" is running */
1808 atomic_inc(&memcg_drain_count); 1808 atomic_inc(&memcg_drain_count);
1809 get_online_cpus(); 1809 get_online_cpus();
1810 for_each_online_cpu(cpu) { 1810 for_each_online_cpu(cpu) {
1811 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 1811 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1812 schedule_work_on(cpu, &stock->work); 1812 schedule_work_on(cpu, &stock->work);
1813 } 1813 }
1814 put_online_cpus(); 1814 put_online_cpus();
1815 atomic_dec(&memcg_drain_count); 1815 atomic_dec(&memcg_drain_count);
1816 /* We don't wait for flush_work */ 1816 /* We don't wait for flush_work */
1817 } 1817 }
1818 1818
1819 /* This is a synchronous drain interface. */ 1819 /* This is a synchronous drain interface. */
1820 static void drain_all_stock_sync(void) 1820 static void drain_all_stock_sync(void)
1821 { 1821 {
1822 /* called when force_empty is called */ 1822 /* called when force_empty is called */
1823 atomic_inc(&memcg_drain_count); 1823 atomic_inc(&memcg_drain_count);
1824 schedule_on_each_cpu(drain_local_stock); 1824 schedule_on_each_cpu(drain_local_stock);
1825 atomic_dec(&memcg_drain_count); 1825 atomic_dec(&memcg_drain_count);
1826 } 1826 }
1827 1827
1828 /* 1828 /*
1829 * This function drains the percpu counter values from a DEAD cpu and 1829 * This function drains the percpu counter values from a DEAD cpu and
1830 * moves them to the local cpu. Note that this function can be preempted. 1830 * moves them to the local cpu. Note that this function can be preempted.
1831 */ 1831 */
1832 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 1832 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1833 { 1833 {
1834 int i; 1834 int i;
1835 1835
1836 spin_lock(&mem->pcp_counter_lock); 1836 spin_lock(&mem->pcp_counter_lock);
1837 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 1837 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1838 long x = per_cpu(mem->stat->count[i], cpu); 1838 long x = per_cpu(mem->stat->count[i], cpu);
1839 1839
1840 per_cpu(mem->stat->count[i], cpu) = 0; 1840 per_cpu(mem->stat->count[i], cpu) = 0;
1841 mem->nocpu_base.count[i] += x; 1841 mem->nocpu_base.count[i] += x;
1842 } 1842 }
1843 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 1843 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
1844 unsigned long x = per_cpu(mem->stat->events[i], cpu); 1844 unsigned long x = per_cpu(mem->stat->events[i], cpu);
1845 1845
1846 per_cpu(mem->stat->events[i], cpu) = 0; 1846 per_cpu(mem->stat->events[i], cpu) = 0;
1847 mem->nocpu_base.events[i] += x; 1847 mem->nocpu_base.events[i] += x;
1848 } 1848 }
1849 /* need to clear ON_MOVE value, works as a kind of lock. */ 1849 /* need to clear ON_MOVE value, works as a kind of lock. */
1850 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 1850 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1851 spin_unlock(&mem->pcp_counter_lock); 1851 spin_unlock(&mem->pcp_counter_lock);
1852 } 1852 }
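/*
 * A compilable userspace model of the fold above: when a "cpu" goes away its
 * private counters are zeroed and accumulated into a shared base under a
 * lock, so no events are lost. NCPU/NSTAT and the array layout are
 * illustrative stand-ins, not the kernel's data structures.
 */
#include <pthread.h>

#define NCPU	4
#define NSTAT	3

static long percpu_count[NCPU][NSTAT];
static long nocpu_base[NSTAT];
static pthread_mutex_t fold_lock = PTHREAD_MUTEX_INITIALIZER;

static void fold_dead_cpu(int cpu)
{
	int i;

	pthread_mutex_lock(&fold_lock);
	for (i = 0; i < NSTAT; i++) {
		nocpu_base[i] += percpu_count[cpu][i];	/* keep the totals */
		percpu_count[cpu][i] = 0;		/* dead cpu starts clean */
	}
	pthread_mutex_unlock(&fold_lock);
}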
1853 1853
1854 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 1854 static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1855 { 1855 {
1856 int idx = MEM_CGROUP_ON_MOVE; 1856 int idx = MEM_CGROUP_ON_MOVE;
1857 1857
1858 spin_lock(&mem->pcp_counter_lock); 1858 spin_lock(&mem->pcp_counter_lock);
1859 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 1859 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1860 spin_unlock(&mem->pcp_counter_lock); 1860 spin_unlock(&mem->pcp_counter_lock);
1861 } 1861 }
1862 1862
1863 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 1863 static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1864 unsigned long action, 1864 unsigned long action,
1865 void *hcpu) 1865 void *hcpu)
1866 { 1866 {
1867 int cpu = (unsigned long)hcpu; 1867 int cpu = (unsigned long)hcpu;
1868 struct memcg_stock_pcp *stock; 1868 struct memcg_stock_pcp *stock;
1869 struct mem_cgroup *iter; 1869 struct mem_cgroup *iter;
1870 1870
1871 if ((action == CPU_ONLINE)) { 1871 if ((action == CPU_ONLINE)) {
1872 for_each_mem_cgroup_all(iter) 1872 for_each_mem_cgroup_all(iter)
1873 synchronize_mem_cgroup_on_move(iter, cpu); 1873 synchronize_mem_cgroup_on_move(iter, cpu);
1874 return NOTIFY_OK; 1874 return NOTIFY_OK;
1875 } 1875 }
1876 1876
1877 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 1877 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1878 return NOTIFY_OK; 1878 return NOTIFY_OK;
1879 1879
1880 for_each_mem_cgroup_all(iter) 1880 for_each_mem_cgroup_all(iter)
1881 mem_cgroup_drain_pcp_counter(iter, cpu); 1881 mem_cgroup_drain_pcp_counter(iter, cpu);
1882 1882
1883 stock = &per_cpu(memcg_stock, cpu); 1883 stock = &per_cpu(memcg_stock, cpu);
1884 drain_stock(stock); 1884 drain_stock(stock);
1885 return NOTIFY_OK; 1885 return NOTIFY_OK;
1886 } 1886 }
1887 1887
1888 1888
1889 /* See __mem_cgroup_try_charge() for details */ 1889 /* See __mem_cgroup_try_charge() for details */
1890 enum { 1890 enum {
1891 CHARGE_OK, /* success */ 1891 CHARGE_OK, /* success */
1892 CHARGE_RETRY, /* need to retry but retry is not bad */ 1892 CHARGE_RETRY, /* need to retry but retry is not bad */
1893 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 1893 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
1894 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */ 1894 CHARGE_WOULDBLOCK, /* __GFP_WAIT wasn't set and not enough res. */
1895 CHARGE_OOM_DIE, /* the current task is killed because of OOM */ 1895 CHARGE_OOM_DIE, /* the current task is killed because of OOM */
1896 }; 1896 };
1897 1897
1898 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 1898 static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1899 unsigned int nr_pages, bool oom_check) 1899 unsigned int nr_pages, bool oom_check)
1900 { 1900 {
1901 unsigned long csize = nr_pages * PAGE_SIZE; 1901 unsigned long csize = nr_pages * PAGE_SIZE;
1902 struct mem_cgroup *mem_over_limit; 1902 struct mem_cgroup *mem_over_limit;
1903 struct res_counter *fail_res; 1903 struct res_counter *fail_res;
1904 unsigned long flags = 0; 1904 unsigned long flags = 0;
1905 int ret; 1905 int ret;
1906 1906
1907 ret = res_counter_charge(&mem->res, csize, &fail_res); 1907 ret = res_counter_charge(&mem->res, csize, &fail_res);
1908 1908
1909 if (likely(!ret)) { 1909 if (likely(!ret)) {
1910 if (!do_swap_account) 1910 if (!do_swap_account)
1911 return CHARGE_OK; 1911 return CHARGE_OK;
1912 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 1912 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1913 if (likely(!ret)) 1913 if (likely(!ret))
1914 return CHARGE_OK; 1914 return CHARGE_OK;
1915 1915
1916 res_counter_uncharge(&mem->res, csize); 1916 res_counter_uncharge(&mem->res, csize);
1917 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 1917 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1918 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 1918 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1919 } else 1919 } else
1920 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 1920 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1921 /* 1921 /*
1922 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch 1922 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
1923 * of regular pages (CHARGE_BATCH), or a single regular page (1). 1923 * of regular pages (CHARGE_BATCH), or a single regular page (1).
1924 * 1924 *
1925 * Never reclaim on behalf of optional batching, retry with a 1925 * Never reclaim on behalf of optional batching, retry with a
1926 * single page instead. 1926 * single page instead.
1927 */ 1927 */
1928 if (nr_pages == CHARGE_BATCH) 1928 if (nr_pages == CHARGE_BATCH)
1929 return CHARGE_RETRY; 1929 return CHARGE_RETRY;
1930 1930
1931 if (!(gfp_mask & __GFP_WAIT)) 1931 if (!(gfp_mask & __GFP_WAIT))
1932 return CHARGE_WOULDBLOCK; 1932 return CHARGE_WOULDBLOCK;
1933 1933
1934 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 1934 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1935 gfp_mask, flags, NULL); 1935 gfp_mask, flags, NULL);
1936 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 1936 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1937 return CHARGE_RETRY; 1937 return CHARGE_RETRY;
1938 /* 1938 /*
1939 * Even though the limit is exceeded at this point, reclaim 1939 * Even though the limit is exceeded at this point, reclaim
1940 * may have been able to free some pages. Retry the charge 1940 * may have been able to free some pages. Retry the charge
1941 * before killing the task. 1941 * before killing the task.
1942 * 1942 *
1943 * Only for regular pages, though: huge pages are rather 1943 * Only for regular pages, though: huge pages are rather
1944 * unlikely to succeed so close to the limit, and we fall back 1944 * unlikely to succeed so close to the limit, and we fall back
1945 * to regular pages anyway in case of failure. 1945 * to regular pages anyway in case of failure.
1946 */ 1946 */
1947 if (nr_pages == 1 && ret) 1947 if (nr_pages == 1 && ret)
1948 return CHARGE_RETRY; 1948 return CHARGE_RETRY;
1949 1949
1950 /* 1950 /*
1951 * At task move, charge accounts can be doubly counted. So, it's 1951 * At task move, charge accounts can be doubly counted. So, it's
1952 * better to wait until the end of task_move if something is going on. 1952 * better to wait until the end of task_move if something is going on.
1953 */ 1953 */
1954 if (mem_cgroup_wait_acct_move(mem_over_limit)) 1954 if (mem_cgroup_wait_acct_move(mem_over_limit))
1955 return CHARGE_RETRY; 1955 return CHARGE_RETRY;
1956 1956
1957 /* If we don't need to invoke the oom-killer at all, return immediately */ 1957 /* If we don't need to invoke the oom-killer at all, return immediately */
1958 if (!oom_check) 1958 if (!oom_check)
1959 return CHARGE_NOMEM; 1959 return CHARGE_NOMEM;
1960 /* check OOM */ 1960 /* check OOM */
1961 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) 1961 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1962 return CHARGE_OOM_DIE; 1962 return CHARGE_OOM_DIE;
1963 1963
1964 return CHARGE_RETRY; 1964 return CHARGE_RETRY;
1965 } 1965 }
1966 1966
1967 /* 1967 /*
1968 * Unlike the exported interface, an "oom" parameter is added. If oom==true, 1968 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
1969 * oom-killer can be invoked. 1969 * oom-killer can be invoked.
1970 */ 1970 */
1971 static int __mem_cgroup_try_charge(struct mm_struct *mm, 1971 static int __mem_cgroup_try_charge(struct mm_struct *mm,
1972 gfp_t gfp_mask, 1972 gfp_t gfp_mask,
1973 unsigned int nr_pages, 1973 unsigned int nr_pages,
1974 struct mem_cgroup **memcg, 1974 struct mem_cgroup **memcg,
1975 bool oom) 1975 bool oom)
1976 { 1976 {
1977 unsigned int batch = max(CHARGE_BATCH, nr_pages); 1977 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1978 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 1978 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1979 struct mem_cgroup *mem = NULL; 1979 struct mem_cgroup *mem = NULL;
1980 int ret; 1980 int ret;
1981 1981
1982 /* 1982 /*
1983 * Unlike the global VM's OOM kill, we are not under a system-wide 1983 * Unlike the global VM's OOM kill, we are not under a system-wide
1984 * memory shortage here, so let a dying process go ahead in addition 1984 * memory shortage here, so let a dying process go ahead in addition
1985 * to a MEMDIE process. 1985 * to a MEMDIE process.
1986 */ 1986 */
1987 if (unlikely(test_thread_flag(TIF_MEMDIE) 1987 if (unlikely(test_thread_flag(TIF_MEMDIE)
1988 || fatal_signal_pending(current))) 1988 || fatal_signal_pending(current)))
1989 goto bypass; 1989 goto bypass;
1990 1990
1991 /* 1991 /*
1992 * We always charge the cgroup the mm_struct belongs to. 1992 * We always charge the cgroup the mm_struct belongs to.
1993 * The mm_struct's mem_cgroup changes on task migration if the 1993 * The mm_struct's mem_cgroup changes on task migration if the
1994 * thread group leader migrates. It's possible that mm is not 1994 * thread group leader migrates. It's possible that mm is not
1995 * set, if so charge the init_mm (happens for pagecache usage). 1995 * set, if so charge the init_mm (happens for pagecache usage).
1996 */ 1996 */
1997 if (!*memcg && !mm) 1997 if (!*memcg && !mm)
1998 goto bypass; 1998 goto bypass;
1999 again: 1999 again:
2000 if (*memcg) { /* css should be a valid one */ 2000 if (*memcg) { /* css should be a valid one */
2001 mem = *memcg; 2001 mem = *memcg;
2002 VM_BUG_ON(css_is_removed(&mem->css)); 2002 VM_BUG_ON(css_is_removed(&mem->css));
2003 if (mem_cgroup_is_root(mem)) 2003 if (mem_cgroup_is_root(mem))
2004 goto done; 2004 goto done;
2005 if (nr_pages == 1 && consume_stock(mem)) 2005 if (nr_pages == 1 && consume_stock(mem))
2006 goto done; 2006 goto done;
2007 css_get(&mem->css); 2007 css_get(&mem->css);
2008 } else { 2008 } else {
2009 struct task_struct *p; 2009 struct task_struct *p;
2010 2010
2011 rcu_read_lock(); 2011 rcu_read_lock();
2012 p = rcu_dereference(mm->owner); 2012 p = rcu_dereference(mm->owner);
2013 /* 2013 /*
2014 * Because we don't have task_lock(), "p" can exit. 2014 * Because we don't have task_lock(), "p" can exit.
2015 * In that case, "mem" can point to root, or p can be NULL due to a 2015 * In that case, "mem" can point to root, or p can be NULL due to a
2016 * race with swapoff. Then we have a small risk of mis-accounting. 2016 * race with swapoff. Then we have a small risk of mis-accounting.
2017 * But such mis-accounting by race can always happen because we 2017 * But such mis-accounting by race can always happen because we
2018 * don't hold cgroup_mutex(). Avoiding it would be overkill, so we 2018 * don't hold cgroup_mutex(). Avoiding it would be overkill, so we
2019 * allow that small race here. 2019 * allow that small race here.
2020 * (*) swapoff et al. will charge against the mm_struct, not the 2020 * (*) swapoff et al. will charge against the mm_struct, not the
2021 * task_struct, so mm->owner can be NULL. 2021 * task_struct, so mm->owner can be NULL.
2022 */ 2022 */
2023 mem = mem_cgroup_from_task(p); 2023 mem = mem_cgroup_from_task(p);
2024 if (!mem || mem_cgroup_is_root(mem)) { 2024 if (!mem || mem_cgroup_is_root(mem)) {
2025 rcu_read_unlock(); 2025 rcu_read_unlock();
2026 goto done; 2026 goto done;
2027 } 2027 }
2028 if (nr_pages == 1 && consume_stock(mem)) { 2028 if (nr_pages == 1 && consume_stock(mem)) {
2029 /* 2029 /*
2030 * It seems dangerous to access memcg without css_get(). 2030 * It seems dangerous to access memcg without css_get().
2031 * But considering how consume_stock() works, it's not 2031 * But considering how consume_stock() works, it's not
2032 * necessary. If consume_stock() succeeds, some charges 2032 * necessary. If consume_stock() succeeds, some charges
2033 * from this memcg are cached on this cpu. So, we 2033 * from this memcg are cached on this cpu. So, we
2034 * don't need to call css_get()/css_tryget() before 2034 * don't need to call css_get()/css_tryget() before
2035 * calling consume_stock(). 2035 * calling consume_stock().
2036 */ 2036 */
2037 rcu_read_unlock(); 2037 rcu_read_unlock();
2038 goto done; 2038 goto done;
2039 } 2039 }
2040 /* after this point we may block, so we need to take a refcnt */ 2040 /* after this point we may block, so we need to take a refcnt */
2041 if (!css_tryget(&mem->css)) { 2041 if (!css_tryget(&mem->css)) {
2042 rcu_read_unlock(); 2042 rcu_read_unlock();
2043 goto again; 2043 goto again;
2044 } 2044 }
2045 rcu_read_unlock(); 2045 rcu_read_unlock();
2046 } 2046 }
2047 2047
2048 do { 2048 do {
2049 bool oom_check; 2049 bool oom_check;
2050 2050
2051 /* If killed, bypass charge */ 2051 /* If killed, bypass charge */
2052 if (fatal_signal_pending(current)) { 2052 if (fatal_signal_pending(current)) {
2053 css_put(&mem->css); 2053 css_put(&mem->css);
2054 goto bypass; 2054 goto bypass;
2055 } 2055 }
2056 2056
2057 oom_check = false; 2057 oom_check = false;
2058 if (oom && !nr_oom_retries) { 2058 if (oom && !nr_oom_retries) {
2059 oom_check = true; 2059 oom_check = true;
2060 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2060 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2061 } 2061 }
2062 2062
2063 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2063 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
2064 switch (ret) { 2064 switch (ret) {
2065 case CHARGE_OK: 2065 case CHARGE_OK:
2066 break; 2066 break;
2067 case CHARGE_RETRY: /* not in OOM situation but retry */ 2067 case CHARGE_RETRY: /* not in OOM situation but retry */
2068 batch = nr_pages; 2068 batch = nr_pages;
2069 css_put(&mem->css); 2069 css_put(&mem->css);
2070 mem = NULL; 2070 mem = NULL;
2071 goto again; 2071 goto again;
2072 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2072 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2073 css_put(&mem->css); 2073 css_put(&mem->css);
2074 goto nomem; 2074 goto nomem;
2075 case CHARGE_NOMEM: /* OOM routine works */ 2075 case CHARGE_NOMEM: /* OOM routine works */
2076 if (!oom) { 2076 if (!oom) {
2077 css_put(&mem->css); 2077 css_put(&mem->css);
2078 goto nomem; 2078 goto nomem;
2079 } 2079 }
2080 /* If oom, we never return -ENOMEM */ 2080 /* If oom, we never return -ENOMEM */
2081 nr_oom_retries--; 2081 nr_oom_retries--;
2082 break; 2082 break;
2083 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2083 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2084 css_put(&mem->css); 2084 css_put(&mem->css);
2085 goto bypass; 2085 goto bypass;
2086 } 2086 }
2087 } while (ret != CHARGE_OK); 2087 } while (ret != CHARGE_OK);
2088 2088
2089 if (batch > nr_pages) 2089 if (batch > nr_pages)
2090 refill_stock(mem, batch - nr_pages); 2090 refill_stock(mem, batch - nr_pages);
2091 css_put(&mem->css); 2091 css_put(&mem->css);
2092 done: 2092 done:
2093 *memcg = mem; 2093 *memcg = mem;
2094 return 0; 2094 return 0;
2095 nomem: 2095 nomem:
2096 *memcg = NULL; 2096 *memcg = NULL;
2097 return -ENOMEM; 2097 return -ENOMEM;
2098 bypass: 2098 bypass:
2099 *memcg = NULL; 2099 *memcg = NULL;
2100 return 0; 2100 return 0;
2101 } 2101 }
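/*
 * The batching idea in __mem_cgroup_try_charge(), modelled in plain
 * single-threaded C: charge a whole batch against the shared counter, hand
 * out what the caller asked for, and park the surplus in a cached "stock" so
 * later single-page charges avoid the shared counter. BATCH, usage, limit and
 * stock are illustrative stand-ins, not kernel symbols.
 */
#include <stdbool.h>

#define BATCH	32UL

static unsigned long usage, limit = 1024;
static unsigned long stock;			/* cached, pre-charged pages */

static bool charge(unsigned long nr_pages)
{
	unsigned long want = nr_pages > BATCH ? nr_pages : BATCH;

	if (nr_pages == 1 && stock) {		/* consume_stock() fast path */
		stock--;
		return true;
	}
	if (usage + want > limit) {
		/* never fail on behalf of optional batching; retry with the
		 * exact request before giving up */
		if (usage + nr_pages > limit)
			return false;
		want = nr_pages;
	}
	usage += want;
	stock += want - nr_pages;		/* refill_stock() with the surplus */
	return true;
}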
2102 2102
2103 /* 2103 /*
2104 * Sometimes we have to undo a charge we got by try_charge(). 2104 * Sometimes we have to undo a charge we got by try_charge().
2105 * This function is for that: it uncharges and puts the css refcnt 2105 * This function is for that: it uncharges and puts the css refcnt
2106 * taken by try_charge(). 2106 * taken by try_charge().
2107 */ 2107 */
2108 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2108 static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
2109 unsigned int nr_pages) 2109 unsigned int nr_pages)
2110 { 2110 {
2111 if (!mem_cgroup_is_root(mem)) { 2111 if (!mem_cgroup_is_root(mem)) {
2112 unsigned long bytes = nr_pages * PAGE_SIZE; 2112 unsigned long bytes = nr_pages * PAGE_SIZE;
2113 2113
2114 res_counter_uncharge(&mem->res, bytes); 2114 res_counter_uncharge(&mem->res, bytes);
2115 if (do_swap_account) 2115 if (do_swap_account)
2116 res_counter_uncharge(&mem->memsw, bytes); 2116 res_counter_uncharge(&mem->memsw, bytes);
2117 } 2117 }
2118 } 2118 }
2119 2119
2120 /* 2120 /*
2121 * A helper function to get a mem_cgroup from an ID. Must be called under 2121 * A helper function to get a mem_cgroup from an ID. Must be called under
2122 * rcu_read_lock(). The caller must check css_is_removed() or similar if 2122 * rcu_read_lock(). The caller must check css_is_removed() or similar if
2123 * that is a concern. (Dropping a refcnt from swap can be called against a 2123 * that is a concern. (Dropping a refcnt from swap can be called against a
2124 * removed memcg.) 2124 * removed memcg.)
2125 */ 2125 */
2126 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2126 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2127 { 2127 {
2128 struct cgroup_subsys_state *css; 2128 struct cgroup_subsys_state *css;
2129 2129
2130 /* ID 0 is unused ID */ 2130 /* ID 0 is unused ID */
2131 if (!id) 2131 if (!id)
2132 return NULL; 2132 return NULL;
2133 css = css_lookup(&mem_cgroup_subsys, id); 2133 css = css_lookup(&mem_cgroup_subsys, id);
2134 if (!css) 2134 if (!css)
2135 return NULL; 2135 return NULL;
2136 return container_of(css, struct mem_cgroup, css); 2136 return container_of(css, struct mem_cgroup, css);
2137 } 2137 }
2138 2138
2139 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2139 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2140 { 2140 {
2141 struct mem_cgroup *mem = NULL; 2141 struct mem_cgroup *mem = NULL;
2142 struct page_cgroup *pc; 2142 struct page_cgroup *pc;
2143 unsigned short id; 2143 unsigned short id;
2144 swp_entry_t ent; 2144 swp_entry_t ent;
2145 2145
2146 VM_BUG_ON(!PageLocked(page)); 2146 VM_BUG_ON(!PageLocked(page));
2147 2147
2148 pc = lookup_page_cgroup(page); 2148 pc = lookup_page_cgroup(page);
2149 lock_page_cgroup(pc); 2149 lock_page_cgroup(pc);
2150 if (PageCgroupUsed(pc)) { 2150 if (PageCgroupUsed(pc)) {
2151 mem = pc->mem_cgroup; 2151 mem = pc->mem_cgroup;
2152 if (mem && !css_tryget(&mem->css)) 2152 if (mem && !css_tryget(&mem->css))
2153 mem = NULL; 2153 mem = NULL;
2154 } else if (PageSwapCache(page)) { 2154 } else if (PageSwapCache(page)) {
2155 ent.val = page_private(page); 2155 ent.val = page_private(page);
2156 id = lookup_swap_cgroup(ent); 2156 id = lookup_swap_cgroup(ent);
2157 rcu_read_lock(); 2157 rcu_read_lock();
2158 mem = mem_cgroup_lookup(id); 2158 mem = mem_cgroup_lookup(id);
2159 if (mem && !css_tryget(&mem->css)) 2159 if (mem && !css_tryget(&mem->css))
2160 mem = NULL; 2160 mem = NULL;
2161 rcu_read_unlock(); 2161 rcu_read_unlock();
2162 } 2162 }
2163 unlock_page_cgroup(pc); 2163 unlock_page_cgroup(pc);
2164 return mem; 2164 return mem;
2165 } 2165 }
2166 2166
2167 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2167 static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2168 struct page *page, 2168 struct page *page,
2169 unsigned int nr_pages, 2169 unsigned int nr_pages,
2170 struct page_cgroup *pc, 2170 struct page_cgroup *pc,
2171 enum charge_type ctype) 2171 enum charge_type ctype)
2172 { 2172 {
2173 lock_page_cgroup(pc); 2173 lock_page_cgroup(pc);
2174 if (unlikely(PageCgroupUsed(pc))) { 2174 if (unlikely(PageCgroupUsed(pc))) {
2175 unlock_page_cgroup(pc); 2175 unlock_page_cgroup(pc);
2176 __mem_cgroup_cancel_charge(mem, nr_pages); 2176 __mem_cgroup_cancel_charge(mem, nr_pages);
2177 return; 2177 return;
2178 } 2178 }
2179 /* 2179 /*
2180 * we don't need page_cgroup_lock for tail pages, because they are not 2180 * we don't need page_cgroup_lock for tail pages, because they are not
2181 * accessed by any other context at this point. 2181 * accessed by any other context at this point.
2182 */ 2182 */
2183 pc->mem_cgroup = mem; 2183 pc->mem_cgroup = mem;
2184 /* 2184 /*
2185 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2185 * We access a page_cgroup asynchronously without lock_page_cgroup().
2186 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2186 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
2187 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2187 * is accessed after testing USED bit. To make pc->mem_cgroup visible
2188 * before USED bit, we need memory barrier here. 2188 * before USED bit, we need memory barrier here.
2189 * See mem_cgroup_add_lru_list(), etc. 2189 * See mem_cgroup_add_lru_list(), etc.
2190 */ 2190 */
2191 smp_wmb(); 2191 smp_wmb();
2192 switch (ctype) { 2192 switch (ctype) {
2193 case MEM_CGROUP_CHARGE_TYPE_CACHE: 2193 case MEM_CGROUP_CHARGE_TYPE_CACHE:
2194 case MEM_CGROUP_CHARGE_TYPE_SHMEM: 2194 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
2195 SetPageCgroupCache(pc); 2195 SetPageCgroupCache(pc);
2196 SetPageCgroupUsed(pc); 2196 SetPageCgroupUsed(pc);
2197 break; 2197 break;
2198 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2198 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2199 ClearPageCgroupCache(pc); 2199 ClearPageCgroupCache(pc);
2200 SetPageCgroupUsed(pc); 2200 SetPageCgroupUsed(pc);
2201 break; 2201 break;
2202 default: 2202 default:
2203 break; 2203 break;
2204 } 2204 }
2205 2205
2206 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2206 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
2207 unlock_page_cgroup(pc); 2207 unlock_page_cgroup(pc);
2208 /* 2208 /*
2209 * "charge_statistics" updated event counter. Then, check it. 2209 * "charge_statistics" updated event counter. Then, check it.
2210 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2210 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
2211 * if they exceeds softlimit. 2211 * if they exceeds softlimit.
2212 */ 2212 */
2213 memcg_check_events(mem, page); 2213 memcg_check_events(mem, page);
2214 } 2214 }
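/*
 * The smp_wmb() above pairs a data store (pc->mem_cgroup) with a flag store
 * (the USED bit). A userspace analog using C11 release/acquire ordering:
 * publish the payload before the flag, and read the payload only after the
 * flag has been observed. struct pc_model and the function names are
 * illustrative, not kernel interfaces.
 */
#include <stdatomic.h>
#include <stddef.h>

struct pc_model {
	void *mem_cgroup;		/* payload, plain store */
	atomic_bool used;		/* "USED bit", release/acquire */
};

static void publish(struct pc_model *pc, void *memcg)
{
	pc->mem_cgroup = memcg;
	atomic_store_explicit(&pc->used, true, memory_order_release);
}

static void *lookup(struct pc_model *pc)
{
	if (!atomic_load_explicit(&pc->used, memory_order_acquire))
		return NULL;
	return pc->mem_cgroup;		/* ordered after the flag check */
}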
2215 2215
2216 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 2216 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2217 2217
2218 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ 2218 #define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2219 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) 2219 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2220 /* 2220 /*
2221 * Because tail pages are not marked as "used", set it here. We're under 2221 * Because tail pages are not marked as "used", set it here. We're under
2222 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2222 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2223 */ 2223 */
2224 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) 2224 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2225 { 2225 {
2226 struct page_cgroup *head_pc = lookup_page_cgroup(head); 2226 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2227 struct page_cgroup *tail_pc = lookup_page_cgroup(tail); 2227 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2228 unsigned long flags; 2228 unsigned long flags;
2229 2229
2230 if (mem_cgroup_disabled()) 2230 if (mem_cgroup_disabled())
2231 return; 2231 return;
2232 /* 2232 /*
2233 * We have no races with charge/uncharge but will have races with 2233 * We have no races with charge/uncharge but will have races with
2234 * page state accounting. 2234 * page state accounting.
2235 */ 2235 */
2236 move_lock_page_cgroup(head_pc, &flags); 2236 move_lock_page_cgroup(head_pc, &flags);
2237 2237
2238 tail_pc->mem_cgroup = head_pc->mem_cgroup; 2238 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2239 smp_wmb(); /* see __commit_charge() */ 2239 smp_wmb(); /* see __commit_charge() */
2240 if (PageCgroupAcctLRU(head_pc)) { 2240 if (PageCgroupAcctLRU(head_pc)) {
2241 enum lru_list lru; 2241 enum lru_list lru;
2242 struct mem_cgroup_per_zone *mz; 2242 struct mem_cgroup_per_zone *mz;
2243 2243
2244 /* 2244 /*
2245 * LRU flags cannot be copied because we need to add the tail 2245 * LRU flags cannot be copied because we need to add the tail
2246 * page to the LRU via the generic call, which will invoke our hook. 2246 * page to the LRU via the generic call, which will invoke our hook.
2247 * We hold lru_lock, so reduce the counter directly. 2247 * We hold lru_lock, so reduce the counter directly.
2248 */ 2248 */
2249 lru = page_lru(head); 2249 lru = page_lru(head);
2250 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head); 2250 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2251 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 2251 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2252 } 2252 }
2253 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 2253 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2254 move_unlock_page_cgroup(head_pc, &flags); 2254 move_unlock_page_cgroup(head_pc, &flags);
2255 } 2255 }
2256 #endif 2256 #endif
2257 2257
2258 /** 2258 /**
2259 * mem_cgroup_move_account - move account of the page 2259 * mem_cgroup_move_account - move account of the page
2260 * @page: the page 2260 * @page: the page
2261 * @nr_pages: number of regular pages (>1 for huge pages) 2261 * @nr_pages: number of regular pages (>1 for huge pages)
2262 * @pc: page_cgroup of the page. 2262 * @pc: page_cgroup of the page.
2263 * @from: mem_cgroup which the page is moved from. 2263 * @from: mem_cgroup which the page is moved from.
2264 * @to: mem_cgroup which the page is moved to. @from != @to. 2264 * @to: mem_cgroup which the page is moved to. @from != @to.
2265 * @uncharge: whether we should call uncharge and css_put against @from. 2265 * @uncharge: whether we should call uncharge and css_put against @from.
2266 * 2266 *
2267 * The caller must confirm the following. 2267 * The caller must confirm the following.
2268 * - page is not on LRU (isolate_page() is useful.) 2268 * - page is not on LRU (isolate_page() is useful.)
2269 * - compound_lock is held when nr_pages > 1 2269 * - compound_lock is held when nr_pages > 1
2270 * 2270 *
2271 * This function doesn't do "charge" or css_get on the new cgroup; that 2271 * This function doesn't do "charge" or css_get on the new cgroup; that
2272 * should be done by the caller (__mem_cgroup_try_charge would be useful). 2272 * should be done by the caller (__mem_cgroup_try_charge would be useful).
2273 * If @uncharge is true, this function does "uncharge" from the old cgroup, 2273 * If @uncharge is true, this function does "uncharge" from the old cgroup,
2274 * but if @uncharge is false it doesn't, so the caller should do "uncharge". 2274 * but if @uncharge is false it doesn't, so the caller should do "uncharge".
2275 */ 2275 */
2276 static int mem_cgroup_move_account(struct page *page, 2276 static int mem_cgroup_move_account(struct page *page,
2277 unsigned int nr_pages, 2277 unsigned int nr_pages,
2278 struct page_cgroup *pc, 2278 struct page_cgroup *pc,
2279 struct mem_cgroup *from, 2279 struct mem_cgroup *from,
2280 struct mem_cgroup *to, 2280 struct mem_cgroup *to,
2281 bool uncharge) 2281 bool uncharge)
2282 { 2282 {
2283 unsigned long flags; 2283 unsigned long flags;
2284 int ret; 2284 int ret;
2285 2285
2286 VM_BUG_ON(from == to); 2286 VM_BUG_ON(from == to);
2287 VM_BUG_ON(PageLRU(page)); 2287 VM_BUG_ON(PageLRU(page));
2288 /* 2288 /*
2289 * The page is isolated from LRU. So, collapse function 2289 * The page is isolated from LRU. So, collapse function
2290 * will not handle this page. But page splitting can happen. 2290 * will not handle this page. But page splitting can happen.
2291 * Do this check under compound_page_lock(). The caller should 2291 * Do this check under compound_page_lock(). The caller should
2292 * hold it. 2292 * hold it.
2293 */ 2293 */
2294 ret = -EBUSY; 2294 ret = -EBUSY;
2295 if (nr_pages > 1 && !PageTransHuge(page)) 2295 if (nr_pages > 1 && !PageTransHuge(page))
2296 goto out; 2296 goto out;
2297 2297
2298 lock_page_cgroup(pc); 2298 lock_page_cgroup(pc);
2299 2299
2300 ret = -EINVAL; 2300 ret = -EINVAL;
2301 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 2301 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2302 goto unlock; 2302 goto unlock;
2303 2303
2304 move_lock_page_cgroup(pc, &flags); 2304 move_lock_page_cgroup(pc, &flags);
2305 2305
2306 if (PageCgroupFileMapped(pc)) { 2306 if (PageCgroupFileMapped(pc)) {
2307 /* Update mapped_file data for mem_cgroup */ 2307 /* Update mapped_file data for mem_cgroup */
2308 preempt_disable(); 2308 preempt_disable();
2309 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2309 __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2310 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2310 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
2311 preempt_enable(); 2311 preempt_enable();
2312 } 2312 }
2313 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); 2313 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
2314 if (uncharge) 2314 if (uncharge)
2315 /* This is not "cancel", but cancel_charge does all we need. */ 2315 /* This is not "cancel", but cancel_charge does all we need. */
2316 __mem_cgroup_cancel_charge(from, nr_pages); 2316 __mem_cgroup_cancel_charge(from, nr_pages);
2317 2317
2318 /* caller should have done css_get */ 2318 /* caller should have done css_get */
2319 pc->mem_cgroup = to; 2319 pc->mem_cgroup = to;
2320 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); 2320 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
2321 /* 2321 /*
2322 * We charge against "to", which may not have any tasks, so "to" 2322 * We charge against "to", which may not have any tasks, so "to"
2323 * can be under rmdir(). But in the current implementation, the only 2323 * can be under rmdir(). But in the current implementation, the only
2324 * callers of this function are force_empty() and move charge, so it is 2324 * callers of this function are force_empty() and move charge, so it is
2325 * guaranteed that "to" is never removed. So, we don't check rmdir 2325 * guaranteed that "to" is never removed. So, we don't check rmdir
2326 * status here. 2326 * status here.
2327 */ 2327 */
2328 move_unlock_page_cgroup(pc, &flags); 2328 move_unlock_page_cgroup(pc, &flags);
2329 ret = 0; 2329 ret = 0;
2330 unlock: 2330 unlock:
2331 unlock_page_cgroup(pc); 2331 unlock_page_cgroup(pc);
2332 /* 2332 /*
2333 * check events 2333 * check events
2334 */ 2334 */
2335 memcg_check_events(to, page); 2335 memcg_check_events(to, page);
2336 memcg_check_events(from, page); 2336 memcg_check_events(from, page);
2337 out: 2337 out:
2338 return ret; 2338 return ret;
2339 } 2339 }
2340 2340
2341 /* 2341 /*
2342 * move charges to its parent. 2342 * move charges to its parent.
2343 */ 2343 */
2344 2344
2345 static int mem_cgroup_move_parent(struct page *page, 2345 static int mem_cgroup_move_parent(struct page *page,
2346 struct page_cgroup *pc, 2346 struct page_cgroup *pc,
2347 struct mem_cgroup *child, 2347 struct mem_cgroup *child,
2348 gfp_t gfp_mask) 2348 gfp_t gfp_mask)
2349 { 2349 {
2350 struct cgroup *cg = child->css.cgroup; 2350 struct cgroup *cg = child->css.cgroup;
2351 struct cgroup *pcg = cg->parent; 2351 struct cgroup *pcg = cg->parent;
2352 struct mem_cgroup *parent; 2352 struct mem_cgroup *parent;
2353 unsigned int nr_pages; 2353 unsigned int nr_pages;
2354 unsigned long uninitialized_var(flags); 2354 unsigned long uninitialized_var(flags);
2355 int ret; 2355 int ret;
2356 2356
2357 /* Is ROOT ? */ 2357 /* Is ROOT ? */
2358 if (!pcg) 2358 if (!pcg)
2359 return -EINVAL; 2359 return -EINVAL;
2360 2360
2361 ret = -EBUSY; 2361 ret = -EBUSY;
2362 if (!get_page_unless_zero(page)) 2362 if (!get_page_unless_zero(page))
2363 goto out; 2363 goto out;
2364 if (isolate_lru_page(page)) 2364 if (isolate_lru_page(page))
2365 goto put; 2365 goto put;
2366 2366
2367 nr_pages = hpage_nr_pages(page); 2367 nr_pages = hpage_nr_pages(page);
2368 2368
2369 parent = mem_cgroup_from_cont(pcg); 2369 parent = mem_cgroup_from_cont(pcg);
2370 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2370 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2371 if (ret || !parent) 2371 if (ret || !parent)
2372 goto put_back; 2372 goto put_back;
2373 2373
2374 if (nr_pages > 1) 2374 if (nr_pages > 1)
2375 flags = compound_lock_irqsave(page); 2375 flags = compound_lock_irqsave(page);
2376 2376
2377 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2377 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2378 if (ret) 2378 if (ret)
2379 __mem_cgroup_cancel_charge(parent, nr_pages); 2379 __mem_cgroup_cancel_charge(parent, nr_pages);
2380 2380
2381 if (nr_pages > 1) 2381 if (nr_pages > 1)
2382 compound_unlock_irqrestore(page, flags); 2382 compound_unlock_irqrestore(page, flags);
2383 put_back: 2383 put_back:
2384 putback_lru_page(page); 2384 putback_lru_page(page);
2385 put: 2385 put:
2386 put_page(page); 2386 put_page(page);
2387 out: 2387 out:
2388 return ret; 2388 return ret;
2389 } 2389 }
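/*
 * The reparenting order above in miniature: take the charge on the parent
 * first, switch ownership, then release the child's charge, so the page is
 * never unaccounted in between. The two-counter model and the names are
 * illustrative only.
 */
struct counter_model {
	unsigned long pages;
};

static void move_to_parent(struct counter_model *child,
			   struct counter_model *parent,
			   unsigned long nr_pages)
{
	parent->pages += nr_pages;	/* like __mem_cgroup_try_charge(parent) */
	/* ... here the page_cgroup would be re-pointed from child to parent ... */
	child->pages -= nr_pages;	/* drop the old owner's charge */
}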
2390 2390
2391 /* 2391 /*
2392 * Charge the memory controller for page usage. 2392 * Charge the memory controller for page usage.
2393 * Return 2393 * Return
2394 * 0 if the charge was successful 2394 * 0 if the charge was successful
2395 * < 0 if the cgroup is over its limit 2395 * < 0 if the cgroup is over its limit
2396 */ 2396 */
2397 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2397 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2398 gfp_t gfp_mask, enum charge_type ctype) 2398 gfp_t gfp_mask, enum charge_type ctype)
2399 { 2399 {
2400 struct mem_cgroup *mem = NULL; 2400 struct mem_cgroup *mem = NULL;
2401 unsigned int nr_pages = 1; 2401 unsigned int nr_pages = 1;
2402 struct page_cgroup *pc; 2402 struct page_cgroup *pc;
2403 bool oom = true; 2403 bool oom = true;
2404 int ret; 2404 int ret;
2405 2405
2406 if (PageTransHuge(page)) { 2406 if (PageTransHuge(page)) {
2407 nr_pages <<= compound_order(page); 2407 nr_pages <<= compound_order(page);
2408 VM_BUG_ON(!PageTransHuge(page)); 2408 VM_BUG_ON(!PageTransHuge(page));
2409 /* 2409 /*
2410 * Never OOM-kill a process for a huge page. The 2410 * Never OOM-kill a process for a huge page. The
2411 * fault handler will fall back to regular pages. 2411 * fault handler will fall back to regular pages.
2412 */ 2412 */
2413 oom = false; 2413 oom = false;
2414 } 2414 }
2415 2415
2416 pc = lookup_page_cgroup(page); 2416 pc = lookup_page_cgroup(page);
2417 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2417 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2418 2418
2419 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2419 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2420 if (ret || !mem) 2420 if (ret || !mem)
2421 return ret; 2421 return ret;
2422 2422
2423 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2423 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2424 return 0; 2424 return 0;
2425 } 2425 }
2426 2426
2427 int mem_cgroup_newpage_charge(struct page *page, 2427 int mem_cgroup_newpage_charge(struct page *page,
2428 struct mm_struct *mm, gfp_t gfp_mask) 2428 struct mm_struct *mm, gfp_t gfp_mask)
2429 { 2429 {
2430 if (mem_cgroup_disabled()) 2430 if (mem_cgroup_disabled())
2431 return 0; 2431 return 0;
2432 /* 2432 /*
2433 * If already mapped, we don't have to account. 2433 * If already mapped, we don't have to account.
2434 * If page cache, page->mapping has address_space. 2434 * If page cache, page->mapping has address_space.
2435 * But page->mapping may hold a stale anon_vma pointer; detect 2435 * But page->mapping may hold a stale anon_vma pointer; detect
2436 * that with a PageAnon() check. A newly-mapped anon page's 2436 * that with a PageAnon() check. A newly-mapped anon page's
2437 * page->mapping is NULL. 2437 * page->mapping is NULL.
2438 */ 2438 */
2439 if (page_mapped(page) || (page->mapping && !PageAnon(page))) 2439 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
2440 return 0; 2440 return 0;
2441 if (unlikely(!mm)) 2441 if (unlikely(!mm))
2442 mm = &init_mm; 2442 mm = &init_mm;
2443 return mem_cgroup_charge_common(page, mm, gfp_mask, 2443 return mem_cgroup_charge_common(page, mm, gfp_mask,
2444 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2444 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2445 } 2445 }
2446 2446
2447 static void 2447 static void
2448 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2448 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2449 enum charge_type ctype); 2449 enum charge_type ctype);
2450 2450
2451 static void 2451 static void
2452 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2452 __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2453 enum charge_type ctype) 2453 enum charge_type ctype)
2454 { 2454 {
2455 struct page_cgroup *pc = lookup_page_cgroup(page); 2455 struct page_cgroup *pc = lookup_page_cgroup(page);
2456 /* 2456 /*
2457 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page 2457 * In some cases (SwapCache, FUSE's splice_buf->radixtree) the page
2458 * is already on the LRU, which means it may be on some other 2458 * is already on the LRU, which means it may be on some other
2459 * page_cgroup's LRU. Take care of that. 2459 * page_cgroup's LRU. Take care of that.
2460 */ 2460 */
2461 mem_cgroup_lru_del_before_commit(page); 2461 mem_cgroup_lru_del_before_commit(page);
2462 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2462 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2463 mem_cgroup_lru_add_after_commit(page); 2463 mem_cgroup_lru_add_after_commit(page);
2464 return; 2464 return;
2465 } 2465 }
2466 2466
2467 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2467 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2468 gfp_t gfp_mask) 2468 gfp_t gfp_mask)
2469 { 2469 {
2470 struct mem_cgroup *mem = NULL; 2470 struct mem_cgroup *mem = NULL;
2471 int ret; 2471 int ret;
2472 2472
2473 if (mem_cgroup_disabled()) 2473 if (mem_cgroup_disabled())
2474 return 0; 2474 return 0;
2475 if (PageCompound(page)) 2475 if (PageCompound(page))
2476 return 0; 2476 return 0;
2477 /* 2477 /*
2478 * Corner case handling. This is usually called from add_to_page_cache(), 2478 * Corner case handling. This is usually called from add_to_page_cache(),
2479 * but some filesystems (shmem) precharge the page before calling it 2479 * but some filesystems (shmem) precharge the page before calling it
2480 * and then call add_to_page_cache() with GFP_NOWAIT. 2480 * and then call add_to_page_cache() with GFP_NOWAIT.
2481 * 2481 *
2482 * In the GFP_NOWAIT case, the page may be pre-charged before calling 2482 * In the GFP_NOWAIT case, the page may be pre-charged before calling
2483 * add_to_page_cache() (see shmem.c); check for that here and avoid 2483 * add_to_page_cache() (see shmem.c); check for that here and avoid
2484 * charging twice. (It works but has to pay a slightly larger cost.) 2484 * charging twice. (It works but has to pay a slightly larger cost.)
2485 * And when the page is SwapCache, it should take swap information 2485 * And when the page is SwapCache, it should take swap information
2486 * into account. This is under lock_page() now. 2486 * into account. This is under lock_page() now.
2487 */ 2487 */
2488 if (!(gfp_mask & __GFP_WAIT)) { 2488 if (!(gfp_mask & __GFP_WAIT)) {
2489 struct page_cgroup *pc; 2489 struct page_cgroup *pc;
2490 2490
2491 pc = lookup_page_cgroup(page); 2491 pc = lookup_page_cgroup(page);
2492 if (!pc) 2492 if (!pc)
2493 return 0; 2493 return 0;
2494 lock_page_cgroup(pc); 2494 lock_page_cgroup(pc);
2495 if (PageCgroupUsed(pc)) { 2495 if (PageCgroupUsed(pc)) {
2496 unlock_page_cgroup(pc); 2496 unlock_page_cgroup(pc);
2497 return 0; 2497 return 0;
2498 } 2498 }
2499 unlock_page_cgroup(pc); 2499 unlock_page_cgroup(pc);
2500 } 2500 }
2501 2501
2502 if (unlikely(!mm)) 2502 if (unlikely(!mm))
2503 mm = &init_mm; 2503 mm = &init_mm;
2504 2504
2505 if (page_is_file_cache(page)) { 2505 if (page_is_file_cache(page)) {
2506 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2506 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2507 if (ret || !mem) 2507 if (ret || !mem)
2508 return ret; 2508 return ret;
2509 2509
2510 /* 2510 /*
2511 * FUSE reuses pages without going through the final 2511 * FUSE reuses pages without going through the final
2512 * put that would remove them from the LRU list, make 2512 * put that would remove them from the LRU list, make
2513 * sure that they get relinked properly. 2513 * sure that they get relinked properly.
2514 */ 2514 */
2515 __mem_cgroup_commit_charge_lrucare(page, mem, 2515 __mem_cgroup_commit_charge_lrucare(page, mem,
2516 MEM_CGROUP_CHARGE_TYPE_CACHE); 2516 MEM_CGROUP_CHARGE_TYPE_CACHE);
2517 return ret; 2517 return ret;
2518 } 2518 }
2519 /* shmem */ 2519 /* shmem */
2520 if (PageSwapCache(page)) { 2520 if (PageSwapCache(page)) {
2521 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2521 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2522 if (!ret) 2522 if (!ret)
2523 __mem_cgroup_commit_charge_swapin(page, mem, 2523 __mem_cgroup_commit_charge_swapin(page, mem,
2524 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2524 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2525 } else 2525 } else
2526 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2526 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2527 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2527 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2528 2528
2529 return ret; 2529 return ret;
2530 } 2530 }
2531 2531
2532 /* 2532 /*
2533 * During swap-in (try_charge -> commit or cancel), the page is locked. 2533 * During swap-in (try_charge -> commit or cancel), the page is locked.
2534 * When try_charge() returns successfully, one refcnt on the memcg is 2534 * When try_charge() returns successfully, one refcnt on the memcg is
2535 * acquired without a struct page_cgroup. This refcnt will be consumed 2535 * acquired without a struct page_cgroup. This refcnt will be consumed
2536 * by "commit()" or dropped by "cancel()". 2536 * by "commit()" or dropped by "cancel()".
2537 */ 2537 */
2538 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2538 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2539 struct page *page, 2539 struct page *page,
2540 gfp_t mask, struct mem_cgroup **ptr) 2540 gfp_t mask, struct mem_cgroup **ptr)
2541 { 2541 {
2542 struct mem_cgroup *mem; 2542 struct mem_cgroup *mem;
2543 int ret; 2543 int ret;
2544 2544
2545 *ptr = NULL; 2545 *ptr = NULL;
2546 2546
2547 if (mem_cgroup_disabled()) 2547 if (mem_cgroup_disabled())
2548 return 0; 2548 return 0;
2549 2549
2550 if (!do_swap_account) 2550 if (!do_swap_account)
2551 goto charge_cur_mm; 2551 goto charge_cur_mm;
2552 /* 2552 /*
2553 * A racing thread's fault, or swapoff, may have already updated 2553 * A racing thread's fault, or swapoff, may have already updated
2554 * the pte, and even removed page from swap cache: in those cases 2554 * the pte, and even removed page from swap cache: in those cases
2555 * do_swap_page()'s pte_same() test will fail; but there's also a 2555 * do_swap_page()'s pte_same() test will fail; but there's also a
2556 * KSM case which does need to charge the page. 2556 * KSM case which does need to charge the page.
2557 */ 2557 */
2558 if (!PageSwapCache(page)) 2558 if (!PageSwapCache(page))
2559 goto charge_cur_mm; 2559 goto charge_cur_mm;
2560 mem = try_get_mem_cgroup_from_page(page); 2560 mem = try_get_mem_cgroup_from_page(page);
2561 if (!mem) 2561 if (!mem)
2562 goto charge_cur_mm; 2562 goto charge_cur_mm;
2563 *ptr = mem; 2563 *ptr = mem;
2564 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2564 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2565 css_put(&mem->css); 2565 css_put(&mem->css);
2566 return ret; 2566 return ret;
2567 charge_cur_mm: 2567 charge_cur_mm:
2568 if (unlikely(!mm)) 2568 if (unlikely(!mm))
2569 mm = &init_mm; 2569 mm = &init_mm;
2570 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true); 2570 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2571 } 2571 }
2572 2572
2573 static void 2573 static void
2574 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2574 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2575 enum charge_type ctype) 2575 enum charge_type ctype)
2576 { 2576 {
2577 if (mem_cgroup_disabled()) 2577 if (mem_cgroup_disabled())
2578 return; 2578 return;
2579 if (!ptr) 2579 if (!ptr)
2580 return; 2580 return;
2581 cgroup_exclude_rmdir(&ptr->css); 2581 cgroup_exclude_rmdir(&ptr->css);
2582 2582
2583 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype); 2583 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2584 /* 2584 /*
2585 * Now the swap is in memory. This means this page may be 2585 * Now the swap is in memory. This means this page may be
2586 * counted both as mem and swap, i.e. double counted. 2586 * counted both as mem and swap, i.e. double counted.
2587 * Fix that by uncharging from memsw. This SwapCache is basically stable 2587 * Fix that by uncharging from memsw. This SwapCache is basically stable
2588 * under lock_page(), but reuse_swap_page() in memory.c's do_swap_page() 2588 * under lock_page(), but reuse_swap_page() in memory.c's do_swap_page()
2589 * may call delete_from_swap_cache() before we reach here. 2589 * may call delete_from_swap_cache() before we reach here.
2590 */ 2590 */
2591 if (do_swap_account && PageSwapCache(page)) { 2591 if (do_swap_account && PageSwapCache(page)) {
2592 swp_entry_t ent = {.val = page_private(page)}; 2592 swp_entry_t ent = {.val = page_private(page)};
2593 unsigned short id; 2593 unsigned short id;
2594 struct mem_cgroup *memcg; 2594 struct mem_cgroup *memcg;
2595 2595
2596 id = swap_cgroup_record(ent, 0); 2596 id = swap_cgroup_record(ent, 0);
2597 rcu_read_lock(); 2597 rcu_read_lock();
2598 memcg = mem_cgroup_lookup(id); 2598 memcg = mem_cgroup_lookup(id);
2599 if (memcg) { 2599 if (memcg) {
2600 /* 2600 /*
2601 * The recorded memcg can be an obsolete one, so avoid 2601 * The recorded memcg can be an obsolete one, so avoid
2602 * calling css_tryget. 2602 * calling css_tryget.
2603 */ 2603 */
2604 if (!mem_cgroup_is_root(memcg)) 2604 if (!mem_cgroup_is_root(memcg))
2605 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2605 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2606 mem_cgroup_swap_statistics(memcg, false); 2606 mem_cgroup_swap_statistics(memcg, false);
2607 mem_cgroup_put(memcg); 2607 mem_cgroup_put(memcg);
2608 } 2608 }
2609 rcu_read_unlock(); 2609 rcu_read_unlock();
2610 } 2610 }
2611 /* 2611 /*
2612 * At swapin, we may charge against a cgroup which has no tasks, 2612 * At swapin, we may charge against a cgroup which has no tasks,
2613 * so rmdir()->pre_destroy() can be called while we do this charge. 2613 * so rmdir()->pre_destroy() can be called while we do this charge.
2614 * In that case, we need to call pre_destroy() again; check that here. 2614 * In that case, we need to call pre_destroy() again; check that here.
2615 */ 2615 */
2616 cgroup_release_and_wakeup_rmdir(&ptr->css); 2616 cgroup_release_and_wakeup_rmdir(&ptr->css);
2617 } 2617 }
2618 2618
2619 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 2619 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
2620 { 2620 {
2621 __mem_cgroup_commit_charge_swapin(page, ptr, 2621 __mem_cgroup_commit_charge_swapin(page, ptr,
2622 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2622 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2623 } 2623 }
2624 2624
2625 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2625 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2626 { 2626 {
2627 if (mem_cgroup_disabled()) 2627 if (mem_cgroup_disabled())
2628 return; 2628 return;
2629 if (!mem) 2629 if (!mem)
2630 return; 2630 return;
2631 __mem_cgroup_cancel_charge(mem, 1); 2631 __mem_cgroup_cancel_charge(mem, 1);
2632 } 2632 }
2633 2633
2634 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2634 static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2635 unsigned int nr_pages, 2635 unsigned int nr_pages,
2636 const enum charge_type ctype) 2636 const enum charge_type ctype)
2637 { 2637 {
2638 struct memcg_batch_info *batch = NULL; 2638 struct memcg_batch_info *batch = NULL;
2639 bool uncharge_memsw = true; 2639 bool uncharge_memsw = true;
2640 2640
2641 /* If swapout, usage of swap doesn't decrease */ 2641 /* If swapout, usage of swap doesn't decrease */
2642 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2642 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2643 uncharge_memsw = false; 2643 uncharge_memsw = false;
2644 2644
2645 batch = &current->memcg_batch; 2645 batch = &current->memcg_batch;
2646 /* 2646 /*
2647 * Usually we do css_get() when we remember a memcg pointer, but 2647 * Usually we do css_get() when we remember a memcg pointer, but
2648 * here we keep res->usage until the end of a series of uncharges, 2648 * here we keep res->usage until the end of a series of uncharges,
2649 * so it's ok to ignore the memcg's refcnt. 2649 * so it's ok to ignore the memcg's refcnt.
2650 */ 2650 */
2651 if (!batch->memcg) 2651 if (!batch->memcg)
2652 batch->memcg = mem; 2652 batch->memcg = mem;
2653 /* 2653 /*
2654 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2654 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2655 * In those cases, the pages freed in succession can be expected to be in 2655 * In those cases, the pages freed in succession can be expected to be in
2656 * the same cgroup, so we have a chance to coalesce uncharges. 2656 * the same cgroup, so we have a chance to coalesce uncharges.
2657 * But we uncharge one by one if this task was killed by OOM (TIF_MEMDIE), 2657 * But we uncharge one by one if this task was killed by OOM (TIF_MEMDIE),
2658 * because we want to uncharge as soon as possible. 2658 * because we want to uncharge as soon as possible.
2659 */ 2659 */
2660 2660
2661 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2661 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2662 goto direct_uncharge; 2662 goto direct_uncharge;
2663 2663
2664 if (nr_pages > 1) 2664 if (nr_pages > 1)
2665 goto direct_uncharge; 2665 goto direct_uncharge;
2666 2666
2667 /* 2667 /*
2668 * In the typical case, batch->memcg == mem. This means we can 2668 * In the typical case, batch->memcg == mem. This means we can
2669 * merge a series of uncharges into a single res_counter uncharge. 2669 * merge a series of uncharges into a single res_counter uncharge.
2670 * If not, we uncharge the res_counter one by one. 2670 * If not, we uncharge the res_counter one by one.
2671 */ 2671 */
2672 if (batch->memcg != mem) 2672 if (batch->memcg != mem)
2673 goto direct_uncharge; 2673 goto direct_uncharge;
2674 /* remember freed charge and uncharge it later */ 2674 /* remember freed charge and uncharge it later */
2675 batch->nr_pages++; 2675 batch->nr_pages++;
2676 if (uncharge_memsw) 2676 if (uncharge_memsw)
2677 batch->memsw_nr_pages++; 2677 batch->memsw_nr_pages++;
2678 return; 2678 return;
2679 direct_uncharge: 2679 direct_uncharge:
2680 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2680 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2681 if (uncharge_memsw) 2681 if (uncharge_memsw)
2682 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2682 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2683 if (unlikely(batch->memcg != mem)) 2683 if (unlikely(batch->memcg != mem))
2684 memcg_oom_recover(mem); 2684 memcg_oom_recover(mem);
2685 return; 2685 return;
2686 } 2686 }
2687 2687
2688 /* 2688 /*
2689 * uncharge if !page_mapped(page) 2689 * uncharge if !page_mapped(page)
2690 */ 2690 */
2691 static struct mem_cgroup * 2691 static struct mem_cgroup *
2692 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2692 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2693 { 2693 {
2694 struct mem_cgroup *mem = NULL; 2694 struct mem_cgroup *mem = NULL;
2695 unsigned int nr_pages = 1; 2695 unsigned int nr_pages = 1;
2696 struct page_cgroup *pc; 2696 struct page_cgroup *pc;
2697 2697
2698 if (mem_cgroup_disabled()) 2698 if (mem_cgroup_disabled())
2699 return NULL; 2699 return NULL;
2700 2700
2701 if (PageSwapCache(page)) 2701 if (PageSwapCache(page))
2702 return NULL; 2702 return NULL;
2703 2703
2704 if (PageTransHuge(page)) { 2704 if (PageTransHuge(page)) {
2705 nr_pages <<= compound_order(page); 2705 nr_pages <<= compound_order(page);
2706 VM_BUG_ON(!PageTransHuge(page)); 2706 VM_BUG_ON(!PageTransHuge(page));
2707 } 2707 }
2708 /* 2708 /*
2709 * Check if our page_cgroup is valid 2709 * Check if our page_cgroup is valid
2710 */ 2710 */
2711 pc = lookup_page_cgroup(page); 2711 pc = lookup_page_cgroup(page);
2712 if (unlikely(!pc || !PageCgroupUsed(pc))) 2712 if (unlikely(!pc || !PageCgroupUsed(pc)))
2713 return NULL; 2713 return NULL;
2714 2714
2715 lock_page_cgroup(pc); 2715 lock_page_cgroup(pc);
2716 2716
2717 mem = pc->mem_cgroup; 2717 mem = pc->mem_cgroup;
2718 2718
2719 if (!PageCgroupUsed(pc)) 2719 if (!PageCgroupUsed(pc))
2720 goto unlock_out; 2720 goto unlock_out;
2721 2721
2722 switch (ctype) { 2722 switch (ctype) {
2723 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 2723 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
2724 case MEM_CGROUP_CHARGE_TYPE_DROP: 2724 case MEM_CGROUP_CHARGE_TYPE_DROP:
2725 /* See mem_cgroup_prepare_migration() */ 2725 /* See mem_cgroup_prepare_migration() */
2726 if (page_mapped(page) || PageCgroupMigration(pc)) 2726 if (page_mapped(page) || PageCgroupMigration(pc))
2727 goto unlock_out; 2727 goto unlock_out;
2728 break; 2728 break;
2729 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 2729 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
2730 if (!PageAnon(page)) { /* Shared memory */ 2730 if (!PageAnon(page)) { /* Shared memory */
2731 if (page->mapping && !page_is_file_cache(page)) 2731 if (page->mapping && !page_is_file_cache(page))
2732 goto unlock_out; 2732 goto unlock_out;
2733 } else if (page_mapped(page)) /* Anon */ 2733 } else if (page_mapped(page)) /* Anon */
2734 goto unlock_out; 2734 goto unlock_out;
2735 break; 2735 break;
2736 default: 2736 default:
2737 break; 2737 break;
2738 } 2738 }
2739 2739
2740 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 2740 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2741 2741
2742 ClearPageCgroupUsed(pc); 2742 ClearPageCgroupUsed(pc);
2743 /* 2743 /*
2744 * pc->mem_cgroup is not cleared here. It will be accessed when it's 2744 * pc->mem_cgroup is not cleared here. It will be accessed when it's
2745 * freed from the LRU. This is safe because an uncharged page is expected 2745 * freed from the LRU. This is safe because an uncharged page is expected
2746 * not to be reused (it is freed soon). The exception is SwapCache, which 2746 * not to be reused (it is freed soon). The exception is SwapCache, which
2747 * is handled by special functions. 2747 * is handled by special functions.
2748 */ 2748 */
2749 2749
2750 unlock_page_cgroup(pc); 2750 unlock_page_cgroup(pc);
2751 /* 2751 /*
2752 * even after unlock, we have mem->res.usage here and this memcg 2752 * even after unlock, we have mem->res.usage here and this memcg
2753 * will never be freed. 2753 * will never be freed.
2754 */ 2754 */
2755 memcg_check_events(mem, page); 2755 memcg_check_events(mem, page);
2756 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 2756 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2757 mem_cgroup_swap_statistics(mem, true); 2757 mem_cgroup_swap_statistics(mem, true);
2758 mem_cgroup_get(mem); 2758 mem_cgroup_get(mem);
2759 } 2759 }
2760 if (!mem_cgroup_is_root(mem)) 2760 if (!mem_cgroup_is_root(mem))
2761 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 2761 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2762 2762
2763 return mem; 2763 return mem;
2764 2764
2765 unlock_out: 2765 unlock_out:
2766 unlock_page_cgroup(pc); 2766 unlock_page_cgroup(pc);
2767 return NULL; 2767 return NULL;
2768 } 2768 }
2769 2769
2770 void mem_cgroup_uncharge_page(struct page *page) 2770 void mem_cgroup_uncharge_page(struct page *page)
2771 { 2771 {
2772 /* early check. */ 2772 /* early check. */
2773 if (page_mapped(page)) 2773 if (page_mapped(page))
2774 return; 2774 return;
2775 if (page->mapping && !PageAnon(page)) 2775 if (page->mapping && !PageAnon(page))
2776 return; 2776 return;
2777 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 2777 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
2778 } 2778 }
2779 2779
2780 void mem_cgroup_uncharge_cache_page(struct page *page) 2780 void mem_cgroup_uncharge_cache_page(struct page *page)
2781 { 2781 {
2782 VM_BUG_ON(page_mapped(page)); 2782 VM_BUG_ON(page_mapped(page));
2783 VM_BUG_ON(page->mapping); 2783 VM_BUG_ON(page->mapping);
2784 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 2784 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
2785 } 2785 }
2786 2786
2787 /* 2787 /*
2788 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate. 2788 * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate.
2789 * In those cases, pages are freed continuously and can be expected to 2789 * In those cases, pages are freed continuously and can be expected to
2790 * be in the same memcg. Each of these calls itself limits the number of 2790 * be in the same memcg. Each of these calls itself limits the number of
2791 * pages freed at once, so uncharge_start/end() is called properly. 2791 * pages freed at once, so uncharge_start/end() is called properly.
2792 * This may be called multiple (2) times in one context. 2792 * This may be called multiple (2) times in one context.
2793 */ 2793 */
2794 2794
2795 void mem_cgroup_uncharge_start(void) 2795 void mem_cgroup_uncharge_start(void)
2796 { 2796 {
2797 current->memcg_batch.do_batch++; 2797 current->memcg_batch.do_batch++;
2798 /* We can nest. */ 2798 /* We can nest. */
2799 if (current->memcg_batch.do_batch == 1) { 2799 if (current->memcg_batch.do_batch == 1) {
2800 current->memcg_batch.memcg = NULL; 2800 current->memcg_batch.memcg = NULL;
2801 current->memcg_batch.nr_pages = 0; 2801 current->memcg_batch.nr_pages = 0;
2802 current->memcg_batch.memsw_nr_pages = 0; 2802 current->memcg_batch.memsw_nr_pages = 0;
2803 } 2803 }
2804 } 2804 }
2805 2805
2806 void mem_cgroup_uncharge_end(void) 2806 void mem_cgroup_uncharge_end(void)
2807 { 2807 {
2808 struct memcg_batch_info *batch = &current->memcg_batch; 2808 struct memcg_batch_info *batch = &current->memcg_batch;
2809 2809
2810 if (!batch->do_batch) 2810 if (!batch->do_batch)
2811 return; 2811 return;
2812 2812
2813 batch->do_batch--; 2813 batch->do_batch--;
2814 if (batch->do_batch) /* If stacked, do nothing. */ 2814 if (batch->do_batch) /* If stacked, do nothing. */
2815 return; 2815 return;
2816 2816
2817 if (!batch->memcg) 2817 if (!batch->memcg)
2818 return; 2818 return;
2819 /* 2819 /*
2820 * This "batch->memcg" is valid without any css_get/put etc... 2820 * This "batch->memcg" is valid without any css_get/put etc...
2821 * because we hide charges behind us. 2821 * because we hide charges behind us.
2822 */ 2822 */
2823 if (batch->nr_pages) 2823 if (batch->nr_pages)
2824 res_counter_uncharge(&batch->memcg->res, 2824 res_counter_uncharge(&batch->memcg->res,
2825 batch->nr_pages * PAGE_SIZE); 2825 batch->nr_pages * PAGE_SIZE);
2826 if (batch->memsw_nr_pages) 2826 if (batch->memsw_nr_pages)
2827 res_counter_uncharge(&batch->memcg->memsw, 2827 res_counter_uncharge(&batch->memcg->memsw,
2828 batch->memsw_nr_pages * PAGE_SIZE); 2828 batch->memsw_nr_pages * PAGE_SIZE);
2829 memcg_oom_recover(batch->memcg); 2829 memcg_oom_recover(batch->memcg);
2830 /* forget this pointer (for sanity check) */ 2830 /* forget this pointer (for sanity check) */
2831 batch->memcg = NULL; 2831 batch->memcg = NULL;
2832 } 2832 }
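/*
 * Illustrative pairing only: a sketch of how a caller is expected to use the
 * two helpers above, not code from this file (the real callers are the
 * truncate/invalidate/unmap paths mentioned before mem_cgroup_uncharge_start()):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being freed:
 *		mem_cgroup_uncharge_page(page);    (or the _cache_page variant)
 *	mem_cgroup_uncharge_end();
 *
 * While do_batch is non-zero, uncharges of pages belonging to the same memcg
 * are accumulated in current->memcg_batch, and the res_counters are only
 * touched once, in mem_cgroup_uncharge_end() above.
 */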
2833 2833
2834 #ifdef CONFIG_SWAP 2834 #ifdef CONFIG_SWAP
2835 /* 2835 /*
2836 * called after __delete_from_swap_cache() and drops the "page" account. 2836 * called after __delete_from_swap_cache() and drops the "page" account.
2837 * memcg information is recorded in the swap_cgroup of "ent" 2837 * memcg information is recorded in the swap_cgroup of "ent"
2838 */ 2838 */
2839 void 2839 void
2840 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 2840 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2841 { 2841 {
2842 struct mem_cgroup *memcg; 2842 struct mem_cgroup *memcg;
2843 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 2843 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
2844 2844
2845 if (!swapout) /* this was a swap cache but the swap is unused ! */ 2845 if (!swapout) /* this was a swap cache but the swap is unused ! */
2846 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 2846 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
2847 2847
2848 memcg = __mem_cgroup_uncharge_common(page, ctype); 2848 memcg = __mem_cgroup_uncharge_common(page, ctype);
2849 2849
2850 /* 2850 /*
2851 * Record memcg information. If swapout && memcg != NULL, 2851 * Record memcg information. If swapout && memcg != NULL,
2852 * mem_cgroup_get() was called in uncharge(). 2852 * mem_cgroup_get() was called in uncharge().
2853 */ 2853 */
2854 if (do_swap_account && swapout && memcg) 2854 if (do_swap_account && swapout && memcg)
2855 swap_cgroup_record(ent, css_id(&memcg->css)); 2855 swap_cgroup_record(ent, css_id(&memcg->css));
2856 } 2856 }
2857 #endif 2857 #endif
2858 2858
2859 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2859 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2860 /* 2860 /*
2861 * called from swap_entry_free(). Removes the record in swap_cgroup and 2861 * called from swap_entry_free(). Removes the record in swap_cgroup and
2862 * uncharges the "memsw" account. 2862 * uncharges the "memsw" account.
2863 */ 2863 */
2864 void mem_cgroup_uncharge_swap(swp_entry_t ent) 2864 void mem_cgroup_uncharge_swap(swp_entry_t ent)
2865 { 2865 {
2866 struct mem_cgroup *memcg; 2866 struct mem_cgroup *memcg;
2867 unsigned short id; 2867 unsigned short id;
2868 2868
2869 if (!do_swap_account) 2869 if (!do_swap_account)
2870 return; 2870 return;
2871 2871
2872 id = swap_cgroup_record(ent, 0); 2872 id = swap_cgroup_record(ent, 0);
2873 rcu_read_lock(); 2873 rcu_read_lock();
2874 memcg = mem_cgroup_lookup(id); 2874 memcg = mem_cgroup_lookup(id);
2875 if (memcg) { 2875 if (memcg) {
2876 /* 2876 /*
2877 * We uncharge this because swap is freed. 2877 * We uncharge this because swap is freed.
2878 * This memcg can be an obsolete one. We avoid calling css_tryget(). 2878 * This memcg can be an obsolete one. We avoid calling css_tryget().
2879 */ 2879 */
2880 if (!mem_cgroup_is_root(memcg)) 2880 if (!mem_cgroup_is_root(memcg))
2881 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 2881 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
2882 mem_cgroup_swap_statistics(memcg, false); 2882 mem_cgroup_swap_statistics(memcg, false);
2883 mem_cgroup_put(memcg); 2883 mem_cgroup_put(memcg);
2884 } 2884 }
2885 rcu_read_unlock(); 2885 rcu_read_unlock();
2886 } 2886 }
2887 2887
2888 /** 2888 /**
2889 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 2889 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2890 * @entry: swap entry to be moved 2890 * @entry: swap entry to be moved
2891 * @from: mem_cgroup which the entry is moved from 2891 * @from: mem_cgroup which the entry is moved from
2892 * @to: mem_cgroup which the entry is moved to 2892 * @to: mem_cgroup which the entry is moved to
2893 * @need_fixup: whether we should fixup res_counters and refcounts. 2893 * @need_fixup: whether we should fixup res_counters and refcounts.
2894 * 2894 *
2895 * It succeeds only when the swap_cgroup's record for this entry is the same 2895 * It succeeds only when the swap_cgroup's record for this entry is the same
2896 * as the mem_cgroup's id of @from. 2896 * as the mem_cgroup's id of @from.
2897 * 2897 *
2898 * Returns 0 on success, -EINVAL on failure. 2898 * Returns 0 on success, -EINVAL on failure.
2899 * 2899 *
2900 * The caller must have charged to @to, IOW, called res_counter_charge() about 2900 * The caller must have charged to @to, IOW, called res_counter_charge() about
2901 * both res and memsw, and called css_get(). 2901 * both res and memsw, and called css_get().
2902 */ 2902 */
2903 static int mem_cgroup_move_swap_account(swp_entry_t entry, 2903 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2904 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2904 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2905 { 2905 {
2906 unsigned short old_id, new_id; 2906 unsigned short old_id, new_id;
2907 2907
2908 old_id = css_id(&from->css); 2908 old_id = css_id(&from->css);
2909 new_id = css_id(&to->css); 2909 new_id = css_id(&to->css);
2910 2910
2911 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 2911 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2912 mem_cgroup_swap_statistics(from, false); 2912 mem_cgroup_swap_statistics(from, false);
2913 mem_cgroup_swap_statistics(to, true); 2913 mem_cgroup_swap_statistics(to, true);
2914 /* 2914 /*
2915 * This function is only called from task migration context now. 2915 * This function is only called from task migration context now.
2916 * It postpones res_counter and refcount handling till the end 2916 * It postpones res_counter and refcount handling till the end
2917 * of task migration(mem_cgroup_clear_mc()) for performance 2917 * of task migration(mem_cgroup_clear_mc()) for performance
2918 * improvement. But we cannot postpone mem_cgroup_get(to) 2918 * improvement. But we cannot postpone mem_cgroup_get(to)
2919 * because if the process that has been moved to @to does 2919 * because if the process that has been moved to @to does
2920 * swap-in, the refcount of @to might be decreased to 0. 2920 * swap-in, the refcount of @to might be decreased to 0.
2921 */ 2921 */
2922 mem_cgroup_get(to); 2922 mem_cgroup_get(to);
2923 if (need_fixup) { 2923 if (need_fixup) {
2924 if (!mem_cgroup_is_root(from)) 2924 if (!mem_cgroup_is_root(from))
2925 res_counter_uncharge(&from->memsw, PAGE_SIZE); 2925 res_counter_uncharge(&from->memsw, PAGE_SIZE);
2926 mem_cgroup_put(from); 2926 mem_cgroup_put(from);
2927 /* 2927 /*
2928 * we charged both to->res and to->memsw, so we should 2928 * we charged both to->res and to->memsw, so we should
2929 * uncharge to->res. 2929 * uncharge to->res.
2930 */ 2930 */
2931 if (!mem_cgroup_is_root(to)) 2931 if (!mem_cgroup_is_root(to))
2932 res_counter_uncharge(&to->res, PAGE_SIZE); 2932 res_counter_uncharge(&to->res, PAGE_SIZE);
2933 } 2933 }
2934 return 0; 2934 return 0;
2935 } 2935 }
2936 return -EINVAL; 2936 return -EINVAL;
2937 } 2937 }
2938 #else 2938 #else
2939 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 2939 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2940 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 2940 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup)
2941 { 2941 {
2942 return -EINVAL; 2942 return -EINVAL;
2943 } 2943 }
2944 #endif 2944 #endif
2945 2945
2946 /* 2946 /*
2947 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 2947 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
2948 * page belongs to. 2948 * page belongs to.
2949 */ 2949 */
2950 int mem_cgroup_prepare_migration(struct page *page, 2950 int mem_cgroup_prepare_migration(struct page *page,
2951 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 2951 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2952 { 2952 {
2953 struct mem_cgroup *mem = NULL; 2953 struct mem_cgroup *mem = NULL;
2954 struct page_cgroup *pc; 2954 struct page_cgroup *pc;
2955 enum charge_type ctype; 2955 enum charge_type ctype;
2956 int ret = 0; 2956 int ret = 0;
2957 2957
2958 *ptr = NULL; 2958 *ptr = NULL;
2959 2959
2960 VM_BUG_ON(PageTransHuge(page)); 2960 VM_BUG_ON(PageTransHuge(page));
2961 if (mem_cgroup_disabled()) 2961 if (mem_cgroup_disabled())
2962 return 0; 2962 return 0;
2963 2963
2964 pc = lookup_page_cgroup(page); 2964 pc = lookup_page_cgroup(page);
2965 lock_page_cgroup(pc); 2965 lock_page_cgroup(pc);
2966 if (PageCgroupUsed(pc)) { 2966 if (PageCgroupUsed(pc)) {
2967 mem = pc->mem_cgroup; 2967 mem = pc->mem_cgroup;
2968 css_get(&mem->css); 2968 css_get(&mem->css);
2969 /* 2969 /*
2970 * When migrating an anonymous page, its mapcount goes down 2970 * When migrating an anonymous page, its mapcount goes down
2971 * to 0 and uncharge() will be called. But, even if it's fully 2971 * to 0 and uncharge() will be called. But, even if it's fully
2972 * unmapped, migration may fail and this page has to be 2972 * unmapped, migration may fail and this page has to be
2973 * charged again. We set MIGRATION flag here and delay uncharge 2973 * charged again. We set MIGRATION flag here and delay uncharge
2974 * until end_migration() is called 2974 * until end_migration() is called
2975 * 2975 *
2976 * Corner Case Thinking 2976 * Corner Case Thinking
2977 * A) 2977 * A)
2978 * When the old page was mapped as Anon and it's unmap-and-freed 2978 * When the old page was mapped as Anon and it's unmap-and-freed
2979 * while migration was ongoing. 2979 * while migration was ongoing.
2980 * If unmap finds the old page, uncharge() of it will be delayed 2980 * If unmap finds the old page, uncharge() of it will be delayed
2981 * until end_migration(). If unmap finds a new page, it's 2981 * until end_migration(). If unmap finds a new page, it's
2982 * uncharged when its mapcount goes from 1 to 0. If the unmap code 2982 * uncharged when its mapcount goes from 1 to 0. If the unmap code
2983 * finds swap_migration_entry, the new page will not be mapped 2983 * finds swap_migration_entry, the new page will not be mapped
2984 * and end_migration() will find it(mapcount==0). 2984 * and end_migration() will find it(mapcount==0).
2985 * 2985 *
2986 * B) 2986 * B)
2987 * When the old page was mapped but migration fails, the kernel 2987 * When the old page was mapped but migration fails, the kernel
2988 * remaps it. A charge for it is kept by MIGRATION flag even 2988 * remaps it. A charge for it is kept by MIGRATION flag even
2989 * if mapcount goes down to 0. We can do remap successfully 2989 * if mapcount goes down to 0. We can do remap successfully
2990 * without charging it again. 2990 * without charging it again.
2991 * 2991 *
2992 * C) 2992 * C)
2993 * The "old" page is under lock_page() until the end of 2993 * The "old" page is under lock_page() until the end of
2994 * migration, so, the old page itself will not be swapped-out. 2994 * migration, so, the old page itself will not be swapped-out.
2995 * If the new page is swapped out before end_migration, our 2995 * If the new page is swapped out before end_migration, our
2996 * hook to usual swap-out path will catch the event. 2996 * hook to usual swap-out path will catch the event.
2997 */ 2997 */
2998 if (PageAnon(page)) 2998 if (PageAnon(page))
2999 SetPageCgroupMigration(pc); 2999 SetPageCgroupMigration(pc);
3000 } 3000 }
3001 unlock_page_cgroup(pc); 3001 unlock_page_cgroup(pc);
3002 /* 3002 /*
3003 * If the page is not charged at this point, 3003 * If the page is not charged at this point,
3004 * we return here. 3004 * we return here.
3005 */ 3005 */
3006 if (!mem) 3006 if (!mem)
3007 return 0; 3007 return 0;
3008 3008
3009 *ptr = mem; 3009 *ptr = mem;
3010 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3010 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3011 css_put(&mem->css);/* drop extra refcnt */ 3011 css_put(&mem->css);/* drop extra refcnt */
3012 if (ret || *ptr == NULL) { 3012 if (ret || *ptr == NULL) {
3013 if (PageAnon(page)) { 3013 if (PageAnon(page)) {
3014 lock_page_cgroup(pc); 3014 lock_page_cgroup(pc);
3015 ClearPageCgroupMigration(pc); 3015 ClearPageCgroupMigration(pc);
3016 unlock_page_cgroup(pc); 3016 unlock_page_cgroup(pc);
3017 /* 3017 /*
3018 * The old page may be fully unmapped while we kept it. 3018 * The old page may be fully unmapped while we kept it.
3019 */ 3019 */
3020 mem_cgroup_uncharge_page(page); 3020 mem_cgroup_uncharge_page(page);
3021 } 3021 }
3022 return -ENOMEM; 3022 return -ENOMEM;
3023 } 3023 }
3024 /* 3024 /*
3025 * We charge the new page before it's used/mapped. So, even if unlock_page() 3025 * We charge the new page before it's used/mapped. So, even if unlock_page()
3026 * is called before end_migration, we can catch all events on this new 3026 * is called before end_migration, we can catch all events on this new
3027 * page. In case the new page is migrated but not remapped, the new page's 3027 * page. In case the new page is migrated but not remapped, the new page's
3028 * mapcount will finally be 0 and we call uncharge in end_migration(). 3028 * mapcount will finally be 0 and we call uncharge in end_migration().
3029 */ 3029 */
3030 pc = lookup_page_cgroup(newpage); 3030 pc = lookup_page_cgroup(newpage);
3031 if (PageAnon(page)) 3031 if (PageAnon(page))
3032 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3032 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
3033 else if (page_is_file_cache(page)) 3033 else if (page_is_file_cache(page))
3034 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3034 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3035 else 3035 else
3036 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3036 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3037 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3037 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
3038 return ret; 3038 return ret;
3039 } 3039 }
3040 3040
3041 /* remove redundant charge if migration failed */ 3041 /* remove redundant charge if migration failed */
3042 void mem_cgroup_end_migration(struct mem_cgroup *mem, 3042 void mem_cgroup_end_migration(struct mem_cgroup *mem,
3043 struct page *oldpage, struct page *newpage, bool migration_ok) 3043 struct page *oldpage, struct page *newpage, bool migration_ok)
3044 { 3044 {
3045 struct page *used, *unused; 3045 struct page *used, *unused;
3046 struct page_cgroup *pc; 3046 struct page_cgroup *pc;
3047 3047
3048 if (!mem) 3048 if (!mem)
3049 return; 3049 return;
3050 /* blocks rmdir() */ 3050 /* blocks rmdir() */
3051 cgroup_exclude_rmdir(&mem->css); 3051 cgroup_exclude_rmdir(&mem->css);
3052 if (!migration_ok) { 3052 if (!migration_ok) {
3053 used = oldpage; 3053 used = oldpage;
3054 unused = newpage; 3054 unused = newpage;
3055 } else { 3055 } else {
3056 used = newpage; 3056 used = newpage;
3057 unused = oldpage; 3057 unused = oldpage;
3058 } 3058 }
3059 /* 3059 /*
3060 * We disallowed uncharge of pages under migration because mapcount 3060 * We disallowed uncharge of pages under migration because mapcount
3061 * of the page goes down to zero, temporarily. 3061 * of the page goes down to zero, temporarily.
3062 * Clear the flag and check whether the page should be charged. 3062 * Clear the flag and check whether the page should be charged.
3063 */ 3063 */
3064 pc = lookup_page_cgroup(oldpage); 3064 pc = lookup_page_cgroup(oldpage);
3065 lock_page_cgroup(pc); 3065 lock_page_cgroup(pc);
3066 ClearPageCgroupMigration(pc); 3066 ClearPageCgroupMigration(pc);
3067 unlock_page_cgroup(pc); 3067 unlock_page_cgroup(pc);
3068 3068
3069 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 3069 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
3070 3070
3071 /* 3071 /*
3072 * If a page is a file cache, radix-tree replacement is very atomic 3072 * If a page is a file cache, radix-tree replacement is very atomic
3073 * and we can skip this check. When it was an Anon page, its mapcount 3073 * and we can skip this check. When it was an Anon page, its mapcount
3074 * goes down to 0. But because we added the MIGRATION flag, it's not 3074 * goes down to 0. But because we added the MIGRATION flag, it's not
3075 * uncharged yet. There are several cases, but the page->mapcount check 3075 * uncharged yet. There are several cases, but the page->mapcount check
3076 * and USED bit check in mem_cgroup_uncharge_page() will do enough 3076 * and USED bit check in mem_cgroup_uncharge_page() will do enough
3077 * check. (see prepare_charge() also) 3077 * check. (see prepare_charge() also)
3078 */ 3078 */
3079 if (PageAnon(used)) 3079 if (PageAnon(used))
3080 mem_cgroup_uncharge_page(used); 3080 mem_cgroup_uncharge_page(used);
3081 /* 3081 /*
3082 * At migration, we may charge against a cgroup which has no 3082 * At migration, we may charge against a cgroup which has no
3083 * tasks. 3083 * tasks.
3084 * So, rmdir()->pre_destroy() can be called while we do this charge. 3084 * So, rmdir()->pre_destroy() can be called while we do this charge.
3085 * In that case, we need to call pre_destroy() again. Check it here. 3085 * In that case, we need to call pre_destroy() again. Check it here.
3086 */ 3086 */
3087 cgroup_release_and_wakeup_rmdir(&mem->css); 3087 cgroup_release_and_wakeup_rmdir(&mem->css);
3088 } 3088 }
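/*
 * Illustrative pairing only: a sketch of the shape of a migration caller,
 * not code from this file (the real caller lives in mm/migrate.c and its
 * error handling is more involved):
 *
 *	struct mem_cgroup *mem = NULL;
 *	int rc;
 *
 *	if (mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL))
 *		goto out;	(charging the new page failed)
 *	rc = ... move the page contents and mappings ...;
 *	mem_cgroup_end_migration(mem, page, newpage, rc == 0);
 *
 * end_migration() then uncharges whichever of the two pages ended up unused
 * and clears the MIGRATION flag set in prepare_migration().
 */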
3089 3089
3090 /* 3090 /*
3091 * A call to try to shrink memory usage on charge failure at shmem's swapin. 3091 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3092 * Calling hierarchical_reclaim is not enough because we should update 3092 * Calling hierarchical_reclaim is not enough because we should update
3093 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. 3093 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3094 * Moreover considering hierarchy, we should reclaim from the mem_over_limit, 3094 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3095 * not from the memcg which this page would be charged to. 3095 * not from the memcg which this page would be charged to.
3096 * try_charge_swapin does all of this work properly. 3096 * try_charge_swapin does all of this work properly.
3097 */ 3097 */
3098 int mem_cgroup_shmem_charge_fallback(struct page *page, 3098 int mem_cgroup_shmem_charge_fallback(struct page *page,
3099 struct mm_struct *mm, 3099 struct mm_struct *mm,
3100 gfp_t gfp_mask) 3100 gfp_t gfp_mask)
3101 { 3101 {
3102 struct mem_cgroup *mem; 3102 struct mem_cgroup *mem;
3103 int ret; 3103 int ret;
3104 3104
3105 if (mem_cgroup_disabled()) 3105 if (mem_cgroup_disabled())
3106 return 0; 3106 return 0;
3107 3107
3108 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 3108 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3109 if (!ret) 3109 if (!ret)
3110 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ 3110 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3111 3111
3112 return ret; 3112 return ret;
3113 } 3113 }
3114 3114
3115 #ifdef CONFIG_DEBUG_VM 3115 #ifdef CONFIG_DEBUG_VM
3116 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 3116 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3117 { 3117 {
3118 struct page_cgroup *pc; 3118 struct page_cgroup *pc;
3119 3119
3120 pc = lookup_page_cgroup(page); 3120 pc = lookup_page_cgroup(page);
3121 if (likely(pc) && PageCgroupUsed(pc)) 3121 if (likely(pc) && PageCgroupUsed(pc))
3122 return pc; 3122 return pc;
3123 return NULL; 3123 return NULL;
3124 } 3124 }
3125 3125
3126 bool mem_cgroup_bad_page_check(struct page *page) 3126 bool mem_cgroup_bad_page_check(struct page *page)
3127 { 3127 {
3128 if (mem_cgroup_disabled()) 3128 if (mem_cgroup_disabled())
3129 return false; 3129 return false;
3130 3130
3131 return lookup_page_cgroup_used(page) != NULL; 3131 return lookup_page_cgroup_used(page) != NULL;
3132 } 3132 }
3133 3133
3134 void mem_cgroup_print_bad_page(struct page *page) 3134 void mem_cgroup_print_bad_page(struct page *page)
3135 { 3135 {
3136 struct page_cgroup *pc; 3136 struct page_cgroup *pc;
3137 3137
3138 pc = lookup_page_cgroup_used(page); 3138 pc = lookup_page_cgroup_used(page);
3139 if (pc) { 3139 if (pc) {
3140 int ret = -1; 3140 int ret = -1;
3141 char *path; 3141 char *path;
3142 3142
3143 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p", 3143 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3144 pc, pc->flags, pc->mem_cgroup); 3144 pc, pc->flags, pc->mem_cgroup);
3145 3145
3146 path = kmalloc(PATH_MAX, GFP_KERNEL); 3146 path = kmalloc(PATH_MAX, GFP_KERNEL);
3147 if (path) { 3147 if (path) {
3148 rcu_read_lock(); 3148 rcu_read_lock();
3149 ret = cgroup_path(pc->mem_cgroup->css.cgroup, 3149 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3150 path, PATH_MAX); 3150 path, PATH_MAX);
3151 rcu_read_unlock(); 3151 rcu_read_unlock();
3152 } 3152 }
3153 3153
3154 printk(KERN_CONT "(%s)\n", 3154 printk(KERN_CONT "(%s)\n",
3155 (ret < 0) ? "cannot get the path" : path); 3155 (ret < 0) ? "cannot get the path" : path);
3156 kfree(path); 3156 kfree(path);
3157 } 3157 }
3158 } 3158 }
3159 #endif 3159 #endif
3160 3160
3161 static DEFINE_MUTEX(set_limit_mutex); 3161 static DEFINE_MUTEX(set_limit_mutex);
3162 3162
3163 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3163 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3164 unsigned long long val) 3164 unsigned long long val)
3165 { 3165 {
3166 int retry_count; 3166 int retry_count;
3167 u64 memswlimit, memlimit; 3167 u64 memswlimit, memlimit;
3168 int ret = 0; 3168 int ret = 0;
3169 int children = mem_cgroup_count_children(memcg); 3169 int children = mem_cgroup_count_children(memcg);
3170 u64 curusage, oldusage; 3170 u64 curusage, oldusage;
3171 int enlarge; 3171 int enlarge;
3172 3172
3173 /* 3173 /*
3174 * For keeping hierarchical_reclaim simple, how long we should retry 3174 * For keeping hierarchical_reclaim simple, how long we should retry
3175 * depends on the callers. We set our retry-count to be a function 3175 * depends on the callers. We set our retry-count to be a function
3176 * of # of children which we should visit in this loop. 3176 * of # of children which we should visit in this loop.
3177 */ 3177 */
3178 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 3178 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
3179 3179
3180 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3180 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3181 3181
3182 enlarge = 0; 3182 enlarge = 0;
3183 while (retry_count) { 3183 while (retry_count) {
3184 if (signal_pending(current)) { 3184 if (signal_pending(current)) {
3185 ret = -EINTR; 3185 ret = -EINTR;
3186 break; 3186 break;
3187 } 3187 }
3188 /* 3188 /*
3189 * Rather than hide all of this in some function, I do it in an 3189 * Rather than hide all of this in some function, I do it in an
3190 * open-coded manner so you can see what it really does. 3190 * open-coded manner so you can see what it really does.
3191 * We have to guarantee mem->res.limit < mem->memsw.limit. 3191 * We have to guarantee mem->res.limit < mem->memsw.limit.
3192 */ 3192 */
3193 mutex_lock(&set_limit_mutex); 3193 mutex_lock(&set_limit_mutex);
3194 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3194 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3195 if (memswlimit < val) { 3195 if (memswlimit < val) {
3196 ret = -EINVAL; 3196 ret = -EINVAL;
3197 mutex_unlock(&set_limit_mutex); 3197 mutex_unlock(&set_limit_mutex);
3198 break; 3198 break;
3199 } 3199 }
3200 3200
3201 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3201 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3202 if (memlimit < val) 3202 if (memlimit < val)
3203 enlarge = 1; 3203 enlarge = 1;
3204 3204
3205 ret = res_counter_set_limit(&memcg->res, val); 3205 ret = res_counter_set_limit(&memcg->res, val);
3206 if (!ret) { 3206 if (!ret) {
3207 if (memswlimit == val) 3207 if (memswlimit == val)
3208 memcg->memsw_is_minimum = true; 3208 memcg->memsw_is_minimum = true;
3209 else 3209 else
3210 memcg->memsw_is_minimum = false; 3210 memcg->memsw_is_minimum = false;
3211 } 3211 }
3212 mutex_unlock(&set_limit_mutex); 3212 mutex_unlock(&set_limit_mutex);
3213 3213
3214 if (!ret) 3214 if (!ret)
3215 break; 3215 break;
3216 3216
3217 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3217 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3218 MEM_CGROUP_RECLAIM_SHRINK, 3218 MEM_CGROUP_RECLAIM_SHRINK,
3219 NULL); 3219 NULL);
3220 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3220 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
3221 /* Usage is reduced ? */ 3221 /* Usage is reduced ? */
3222 if (curusage >= oldusage) 3222 if (curusage >= oldusage)
3223 retry_count--; 3223 retry_count--;
3224 else 3224 else
3225 oldusage = curusage; 3225 oldusage = curusage;
3226 } 3226 }
3227 if (!ret && enlarge) 3227 if (!ret && enlarge)
3228 memcg_oom_recover(memcg); 3228 memcg_oom_recover(memcg);
3229 3229
3230 return ret; 3230 return ret;
3231 } 3231 }
3232 3232
3233 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 3233 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3234 unsigned long long val) 3234 unsigned long long val)
3235 { 3235 {
3236 int retry_count; 3236 int retry_count;
3237 u64 memlimit, memswlimit, oldusage, curusage; 3237 u64 memlimit, memswlimit, oldusage, curusage;
3238 int children = mem_cgroup_count_children(memcg); 3238 int children = mem_cgroup_count_children(memcg);
3239 int ret = -EBUSY; 3239 int ret = -EBUSY;
3240 int enlarge = 0; 3240 int enlarge = 0;
3241 3241
3242 /* see mem_cgroup_resize_res_limit */ 3242 /* see mem_cgroup_resize_res_limit */
3243 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 3243 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
3244 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3244 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3245 while (retry_count) { 3245 while (retry_count) {
3246 if (signal_pending(current)) { 3246 if (signal_pending(current)) {
3247 ret = -EINTR; 3247 ret = -EINTR;
3248 break; 3248 break;
3249 } 3249 }
3250 /* 3250 /*
3251 * Rather than hide all of this in some function, I do it in an 3251 * Rather than hide all of this in some function, I do it in an
3252 * open-coded manner so you can see what it really does. 3252 * open-coded manner so you can see what it really does.
3253 * We have to guarantee mem->res.limit < mem->memsw.limit. 3253 * We have to guarantee mem->res.limit < mem->memsw.limit.
3254 */ 3254 */
3255 mutex_lock(&set_limit_mutex); 3255 mutex_lock(&set_limit_mutex);
3256 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3256 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3257 if (memlimit > val) { 3257 if (memlimit > val) {
3258 ret = -EINVAL; 3258 ret = -EINVAL;
3259 mutex_unlock(&set_limit_mutex); 3259 mutex_unlock(&set_limit_mutex);
3260 break; 3260 break;
3261 } 3261 }
3262 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3262 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3263 if (memswlimit < val) 3263 if (memswlimit < val)
3264 enlarge = 1; 3264 enlarge = 1;
3265 ret = res_counter_set_limit(&memcg->memsw, val); 3265 ret = res_counter_set_limit(&memcg->memsw, val);
3266 if (!ret) { 3266 if (!ret) {
3267 if (memlimit == val) 3267 if (memlimit == val)
3268 memcg->memsw_is_minimum = true; 3268 memcg->memsw_is_minimum = true;
3269 else 3269 else
3270 memcg->memsw_is_minimum = false; 3270 memcg->memsw_is_minimum = false;
3271 } 3271 }
3272 mutex_unlock(&set_limit_mutex); 3272 mutex_unlock(&set_limit_mutex);
3273 3273
3274 if (!ret) 3274 if (!ret)
3275 break; 3275 break;
3276 3276
3277 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3277 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
3278 MEM_CGROUP_RECLAIM_NOSWAP | 3278 MEM_CGROUP_RECLAIM_NOSWAP |
3279 MEM_CGROUP_RECLAIM_SHRINK, 3279 MEM_CGROUP_RECLAIM_SHRINK,
3280 NULL); 3280 NULL);
3281 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3281 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
3282 /* Usage is reduced ? */ 3282 /* Usage is reduced ? */
3283 if (curusage >= oldusage) 3283 if (curusage >= oldusage)
3284 retry_count--; 3284 retry_count--;
3285 else 3285 else
3286 oldusage = curusage; 3286 oldusage = curusage;
3287 } 3287 }
3288 if (!ret && enlarge) 3288 if (!ret && enlarge)
3289 memcg_oom_recover(memcg); 3289 memcg_oom_recover(memcg);
3290 return ret; 3290 return ret;
3291 } 3291 }
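/*
 * The -EINVAL checks in the two resize functions above enforce the invariant
 * res.limit <= memsw.limit. From userspace this becomes an ordering
 * requirement when both limits are set; an illustrative sequence (a sketch,
 * assuming the v1 memory controller is mounted at /cgroup/memory):
 *
 *	echo 2G > /cgroup/memory/grp/memory.memsw.limit_in_bytes
 *	echo 1G > /cgroup/memory/grp/memory.limit_in_bytes
 *
 * Writing memory.limit_in_bytes above the current memsw limit, or
 * memory.memsw.limit_in_bytes below the current memory limit, fails.
 */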
3292 3292
3293 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3293 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3294 gfp_t gfp_mask, 3294 gfp_t gfp_mask,
3295 unsigned long *total_scanned) 3295 unsigned long *total_scanned)
3296 { 3296 {
3297 unsigned long nr_reclaimed = 0; 3297 unsigned long nr_reclaimed = 0;
3298 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3298 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
3299 unsigned long reclaimed; 3299 unsigned long reclaimed;
3300 int loop = 0; 3300 int loop = 0;
3301 struct mem_cgroup_tree_per_zone *mctz; 3301 struct mem_cgroup_tree_per_zone *mctz;
3302 unsigned long long excess; 3302 unsigned long long excess;
3303 unsigned long nr_scanned; 3303 unsigned long nr_scanned;
3304 3304
3305 if (order > 0) 3305 if (order > 0)
3306 return 0; 3306 return 0;
3307 3307
3308 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 3308 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
3309 /* 3309 /*
3310 * This loop can run for a while, especially if mem_cgroups continuously 3310 * This loop can run for a while, especially if mem_cgroups continuously
3311 * keep exceeding their soft limit and putting the system under 3311 * keep exceeding their soft limit and putting the system under
3312 * pressure 3312 * pressure
3313 */ 3313 */
3314 do { 3314 do {
3315 if (next_mz) 3315 if (next_mz)
3316 mz = next_mz; 3316 mz = next_mz;
3317 else 3317 else
3318 mz = mem_cgroup_largest_soft_limit_node(mctz); 3318 mz = mem_cgroup_largest_soft_limit_node(mctz);
3319 if (!mz) 3319 if (!mz)
3320 break; 3320 break;
3321 3321
3322 nr_scanned = 0; 3322 nr_scanned = 0;
3323 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3323 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
3324 gfp_mask, 3324 gfp_mask,
3325 MEM_CGROUP_RECLAIM_SOFT, 3325 MEM_CGROUP_RECLAIM_SOFT,
3326 &nr_scanned); 3326 &nr_scanned);
3327 nr_reclaimed += reclaimed; 3327 nr_reclaimed += reclaimed;
3328 *total_scanned += nr_scanned; 3328 *total_scanned += nr_scanned;
3329 spin_lock(&mctz->lock); 3329 spin_lock(&mctz->lock);
3330 3330
3331 /* 3331 /*
3332 * If we failed to reclaim anything from this memory cgroup 3332 * If we failed to reclaim anything from this memory cgroup
3333 * it is time to move on to the next cgroup 3333 * it is time to move on to the next cgroup
3334 */ 3334 */
3335 next_mz = NULL; 3335 next_mz = NULL;
3336 if (!reclaimed) { 3336 if (!reclaimed) {
3337 do { 3337 do {
3338 /* 3338 /*
3339 * Loop until we find yet another one. 3339 * Loop until we find yet another one.
3340 * 3340 *
3341 * By the time we get the soft_limit lock 3341 * By the time we get the soft_limit lock
3342 * again, someone might have added the 3342 * again, someone might have added the
3343 * group back on the RB tree. Iterate to 3343 * group back on the RB tree. Iterate to
3344 * make sure we get a different mem. 3344 * make sure we get a different mem.
3345 * mem_cgroup_largest_soft_limit_node returns 3345 * mem_cgroup_largest_soft_limit_node returns
3346 * NULL if no other cgroup is present on 3346 * NULL if no other cgroup is present on
3347 * the tree 3347 * the tree
3348 */ 3348 */
3349 next_mz = 3349 next_mz =
3350 __mem_cgroup_largest_soft_limit_node(mctz); 3350 __mem_cgroup_largest_soft_limit_node(mctz);
3351 if (next_mz == mz) { 3351 if (next_mz == mz)
3352 css_put(&next_mz->mem->css); 3352 css_put(&next_mz->mem->css);
3353 next_mz = NULL; 3353 else /* next_mz == NULL or other memcg */
3354 } else /* next_mz == NULL or other memcg */
3355 break; 3354 break;
3356 } while (1); 3355 } while (1);
3357 } 3356 }
3358 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); 3357 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
3359 excess = res_counter_soft_limit_excess(&mz->mem->res); 3358 excess = res_counter_soft_limit_excess(&mz->mem->res);
3360 /* 3359 /*
3361 * One school of thought says that we should not add 3360 * One school of thought says that we should not add
3362 * back the node to the tree if reclaim returns 0. 3361 * back the node to the tree if reclaim returns 0.
3363 * But our reclaim could return 0, simply because due 3362 * But our reclaim could return 0, simply because due
3364 * to priority we are exposing a smaller subset of 3363 * to priority we are exposing a smaller subset of
3365 * memory to reclaim from. Consider this as a longer 3364 * memory to reclaim from. Consider this as a longer
3366 * term TODO. 3365 * term TODO.
3367 */ 3366 */
3368 /* If excess == 0, no tree ops */ 3367 /* If excess == 0, no tree ops */
3369 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); 3368 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
3370 spin_unlock(&mctz->lock); 3369 spin_unlock(&mctz->lock);
3371 css_put(&mz->mem->css); 3370 css_put(&mz->mem->css);
3372 loop++; 3371 loop++;
3373 /* 3372 /*
3374 * Could not reclaim anything and there are no more 3373 * Could not reclaim anything and there are no more
3375 * mem cgroups to try or we seem to be looping without 3374 * mem cgroups to try or we seem to be looping without
3376 * reclaiming anything. 3375 * reclaiming anything.
3377 */ 3376 */
3378 if (!nr_reclaimed && 3377 if (!nr_reclaimed &&
3379 (next_mz == NULL || 3378 (next_mz == NULL ||
3380 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 3379 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3381 break; 3380 break;
3382 } while (!nr_reclaimed); 3381 } while (!nr_reclaimed);
3383 if (next_mz) 3382 if (next_mz)
3384 css_put(&next_mz->mem->css); 3383 css_put(&next_mz->mem->css);
3385 return nr_reclaimed; 3384 return nr_reclaimed;
3386 } 3385 }
3387 3386
3388 /* 3387 /*
3389 * This routine traverses the page_cgroups in the given list and drops them all. 3388 * This routine traverses the page_cgroups in the given list and drops them all.
3390 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3389 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3391 */ 3390 */
3392 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3391 static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3393 int node, int zid, enum lru_list lru) 3392 int node, int zid, enum lru_list lru)
3394 { 3393 {
3395 struct zone *zone; 3394 struct zone *zone;
3396 struct mem_cgroup_per_zone *mz; 3395 struct mem_cgroup_per_zone *mz;
3397 struct page_cgroup *pc, *busy; 3396 struct page_cgroup *pc, *busy;
3398 unsigned long flags, loop; 3397 unsigned long flags, loop;
3399 struct list_head *list; 3398 struct list_head *list;
3400 int ret = 0; 3399 int ret = 0;
3401 3400
3402 zone = &NODE_DATA(node)->node_zones[zid]; 3401 zone = &NODE_DATA(node)->node_zones[zid];
3403 mz = mem_cgroup_zoneinfo(mem, node, zid); 3402 mz = mem_cgroup_zoneinfo(mem, node, zid);
3404 list = &mz->lists[lru]; 3403 list = &mz->lists[lru];
3405 3404
3406 loop = MEM_CGROUP_ZSTAT(mz, lru); 3405 loop = MEM_CGROUP_ZSTAT(mz, lru);
3407 /* give some margin against EBUSY etc... */ 3406 /* give some margin against EBUSY etc... */
3408 loop += 256; 3407 loop += 256;
3409 busy = NULL; 3408 busy = NULL;
3410 while (loop--) { 3409 while (loop--) {
3411 struct page *page; 3410 struct page *page;
3412 3411
3413 ret = 0; 3412 ret = 0;
3414 spin_lock_irqsave(&zone->lru_lock, flags); 3413 spin_lock_irqsave(&zone->lru_lock, flags);
3415 if (list_empty(list)) { 3414 if (list_empty(list)) {
3416 spin_unlock_irqrestore(&zone->lru_lock, flags); 3415 spin_unlock_irqrestore(&zone->lru_lock, flags);
3417 break; 3416 break;
3418 } 3417 }
3419 pc = list_entry(list->prev, struct page_cgroup, lru); 3418 pc = list_entry(list->prev, struct page_cgroup, lru);
3420 if (busy == pc) { 3419 if (busy == pc) {
3421 list_move(&pc->lru, list); 3420 list_move(&pc->lru, list);
3422 busy = NULL; 3421 busy = NULL;
3423 spin_unlock_irqrestore(&zone->lru_lock, flags); 3422 spin_unlock_irqrestore(&zone->lru_lock, flags);
3424 continue; 3423 continue;
3425 } 3424 }
3426 spin_unlock_irqrestore(&zone->lru_lock, flags); 3425 spin_unlock_irqrestore(&zone->lru_lock, flags);
3427 3426
3428 page = lookup_cgroup_page(pc); 3427 page = lookup_cgroup_page(pc);
3429 3428
3430 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3429 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
3431 if (ret == -ENOMEM) 3430 if (ret == -ENOMEM)
3432 break; 3431 break;
3433 3432
3434 if (ret == -EBUSY || ret == -EINVAL) { 3433 if (ret == -EBUSY || ret == -EINVAL) {
3435 /* found lock contention or "pc" is obsolete. */ 3434 /* found lock contention or "pc" is obsolete. */
3436 busy = pc; 3435 busy = pc;
3437 cond_resched(); 3436 cond_resched();
3438 } else 3437 } else
3439 busy = NULL; 3438 busy = NULL;
3440 } 3439 }
3441 3440
3442 if (!ret && !list_empty(list)) 3441 if (!ret && !list_empty(list))
3443 return -EBUSY; 3442 return -EBUSY;
3444 return ret; 3443 return ret;
3445 } 3444 }
3446 3445
3447 /* 3446 /*
3448 * make the mem_cgroup's charge 0 if there is no task. 3447 * make the mem_cgroup's charge 0 if there is no task.
3449 * This enables deleting this mem_cgroup. 3448 * This enables deleting this mem_cgroup.
3450 */ 3449 */
3451 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3450 static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
3452 { 3451 {
3453 int ret; 3452 int ret;
3454 int node, zid, shrink; 3453 int node, zid, shrink;
3455 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3454 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3456 struct cgroup *cgrp = mem->css.cgroup; 3455 struct cgroup *cgrp = mem->css.cgroup;
3457 3456
3458 css_get(&mem->css); 3457 css_get(&mem->css);
3459 3458
3460 shrink = 0; 3459 shrink = 0;
3461 /* should free all ? */ 3460 /* should free all ? */
3462 if (free_all) 3461 if (free_all)
3463 goto try_to_free; 3462 goto try_to_free;
3464 move_account: 3463 move_account:
3465 do { 3464 do {
3466 ret = -EBUSY; 3465 ret = -EBUSY;
3467 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3466 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3468 goto out; 3467 goto out;
3469 ret = -EINTR; 3468 ret = -EINTR;
3470 if (signal_pending(current)) 3469 if (signal_pending(current))
3471 goto out; 3470 goto out;
3472 /* This is for making all *used* pages be on the LRU. */ 3471 /* This is for making all *used* pages be on the LRU. */
3473 lru_add_drain_all(); 3472 lru_add_drain_all();
3474 drain_all_stock_sync(); 3473 drain_all_stock_sync();
3475 ret = 0; 3474 ret = 0;
3476 mem_cgroup_start_move(mem); 3475 mem_cgroup_start_move(mem);
3477 for_each_node_state(node, N_HIGH_MEMORY) { 3476 for_each_node_state(node, N_HIGH_MEMORY) {
3478 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3477 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3479 enum lru_list l; 3478 enum lru_list l;
3480 for_each_lru(l) { 3479 for_each_lru(l) {
3481 ret = mem_cgroup_force_empty_list(mem, 3480 ret = mem_cgroup_force_empty_list(mem,
3482 node, zid, l); 3481 node, zid, l);
3483 if (ret) 3482 if (ret)
3484 break; 3483 break;
3485 } 3484 }
3486 } 3485 }
3487 if (ret) 3486 if (ret)
3488 break; 3487 break;
3489 } 3488 }
3490 mem_cgroup_end_move(mem); 3489 mem_cgroup_end_move(mem);
3491 memcg_oom_recover(mem); 3490 memcg_oom_recover(mem);
3492 /* it seems parent cgroup doesn't have enough mem */ 3491 /* it seems parent cgroup doesn't have enough mem */
3493 if (ret == -ENOMEM) 3492 if (ret == -ENOMEM)
3494 goto try_to_free; 3493 goto try_to_free;
3495 cond_resched(); 3494 cond_resched();
3496 /* "ret" should also be checked to ensure all lists are empty. */ 3495 /* "ret" should also be checked to ensure all lists are empty. */
3497 } while (mem->res.usage > 0 || ret); 3496 } while (mem->res.usage > 0 || ret);
3498 out: 3497 out:
3499 css_put(&mem->css); 3498 css_put(&mem->css);
3500 return ret; 3499 return ret;
3501 3500
3502 try_to_free: 3501 try_to_free:
3503 /* returns EBUSY if there is a task or if we come here twice. */ 3502 /* returns EBUSY if there is a task or if we come here twice. */
3504 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) { 3503 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
3505 ret = -EBUSY; 3504 ret = -EBUSY;
3506 goto out; 3505 goto out;
3507 } 3506 }
3508 /* we call try-to-free pages to make this cgroup empty */ 3507 /* we call try-to-free pages to make this cgroup empty */
3509 lru_add_drain_all(); 3508 lru_add_drain_all();
3510 /* try to free all pages in this cgroup */ 3509 /* try to free all pages in this cgroup */
3511 shrink = 1; 3510 shrink = 1;
3512 while (nr_retries && mem->res.usage > 0) { 3511 while (nr_retries && mem->res.usage > 0) {
3513 int progress; 3512 int progress;
3514 3513
3515 if (signal_pending(current)) { 3514 if (signal_pending(current)) {
3516 ret = -EINTR; 3515 ret = -EINTR;
3517 goto out; 3516 goto out;
3518 } 3517 }
3519 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, 3518 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3520 false, get_swappiness(mem)); 3519 false, get_swappiness(mem));
3521 if (!progress) { 3520 if (!progress) {
3522 nr_retries--; 3521 nr_retries--;
3523 /* maybe some writeback is necessary */ 3522 /* maybe some writeback is necessary */
3524 congestion_wait(BLK_RW_ASYNC, HZ/10); 3523 congestion_wait(BLK_RW_ASYNC, HZ/10);
3525 } 3524 }
3526 3525
3527 } 3526 }
3528 lru_add_drain(); 3527 lru_add_drain();
3529 /* try move_account...there may be some *locked* pages. */ 3528 /* try move_account...there may be some *locked* pages. */
3530 goto move_account; 3529 goto move_account;
3531 } 3530 }
3532 3531
3533 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3532 int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3534 { 3533 {
3535 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3534 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3536 } 3535 }
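/*
 * This is the handler behind the memory.force_empty control file; a typical
 * invocation from userspace (a sketch, same mount assumption as above) is:
 *
 *	echo 0 > /cgroup/memory/grp/memory.force_empty
 *
 * which tries to move or reclaim every remaining charge so that the group
 * can be removed afterwards.
 */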
3537 3536
3538 3537
3539 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) 3538 static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
3540 { 3539 {
3541 return mem_cgroup_from_cont(cont)->use_hierarchy; 3540 return mem_cgroup_from_cont(cont)->use_hierarchy;
3542 } 3541 }
3543 3542
3544 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, 3543 static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3545 u64 val) 3544 u64 val)
3546 { 3545 {
3547 int retval = 0; 3546 int retval = 0;
3548 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3547 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3549 struct cgroup *parent = cont->parent; 3548 struct cgroup *parent = cont->parent;
3550 struct mem_cgroup *parent_mem = NULL; 3549 struct mem_cgroup *parent_mem = NULL;
3551 3550
3552 if (parent) 3551 if (parent)
3553 parent_mem = mem_cgroup_from_cont(parent); 3552 parent_mem = mem_cgroup_from_cont(parent);
3554 3553
3555 cgroup_lock(); 3554 cgroup_lock();
3556 /* 3555 /*
3557 * If parent's use_hierarchy is set, we can't make any modifications 3556 * If parent's use_hierarchy is set, we can't make any modifications
3558 * in the child subtrees. If it is unset, then the change can 3557 * in the child subtrees. If it is unset, then the change can
3559 * occur, provided the current cgroup has no children. 3558 * occur, provided the current cgroup has no children.
3560 * 3559 *
3561 * For the root cgroup, parent_mem is NULL, so we allow the value to be 3560 * For the root cgroup, parent_mem is NULL, so we allow the value to be
3562 * set if there are no children. 3561 * set if there are no children.
3563 */ 3562 */
3564 if ((!parent_mem || !parent_mem->use_hierarchy) && 3563 if ((!parent_mem || !parent_mem->use_hierarchy) &&
3565 (val == 1 || val == 0)) { 3564 (val == 1 || val == 0)) {
3566 if (list_empty(&cont->children)) 3565 if (list_empty(&cont->children))
3567 mem->use_hierarchy = val; 3566 mem->use_hierarchy = val;
3568 else 3567 else
3569 retval = -EBUSY; 3568 retval = -EBUSY;
3570 } else 3569 } else
3571 retval = -EINVAL; 3570 retval = -EINVAL;
3572 cgroup_unlock(); 3571 cgroup_unlock();
3573 3572
3574 return retval; 3573 return retval;
3575 } 3574 }
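/*
 * These two handlers back the memory.use_hierarchy file. As the checks above
 * imply, a write only succeeds while the group is still "fresh"; an
 * illustrative sequence (a sketch, same mount assumption as above):
 *
 *	mkdir /cgroup/memory/grp
 *	echo 1 > /cgroup/memory/grp/memory.use_hierarchy
 *
 * The write returns -EBUSY once the group has children, and -EINVAL if the
 * parent already has use_hierarchy set (the child inherits it in that case).
 */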
3576 3575
3577 3576
3578 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3577 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3579 enum mem_cgroup_stat_index idx) 3578 enum mem_cgroup_stat_index idx)
3580 { 3579 {
3581 struct mem_cgroup *iter; 3580 struct mem_cgroup *iter;
3582 long val = 0; 3581 long val = 0;
3583 3582
3584 /* Per-cpu values can be negative, use a signed accumulator */ 3583 /* Per-cpu values can be negative, use a signed accumulator */
3585 for_each_mem_cgroup_tree(iter, mem) 3584 for_each_mem_cgroup_tree(iter, mem)
3586 val += mem_cgroup_read_stat(iter, idx); 3585 val += mem_cgroup_read_stat(iter, idx);
3587 3586
3588 if (val < 0) /* race ? */ 3587 if (val < 0) /* race ? */
3589 val = 0; 3588 val = 0;
3590 return val; 3589 return val;
3591 } 3590 }
3592 3591
3593 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3592 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3594 { 3593 {
3595 u64 val; 3594 u64 val;
3596 3595
3597 if (!mem_cgroup_is_root(mem)) { 3596 if (!mem_cgroup_is_root(mem)) {
3598 if (!swap) 3597 if (!swap)
3599 return res_counter_read_u64(&mem->res, RES_USAGE); 3598 return res_counter_read_u64(&mem->res, RES_USAGE);
3600 else 3599 else
3601 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3600 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3602 } 3601 }
3603 3602
3604 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3603 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3605 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3604 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3606 3605
3607 if (swap) 3606 if (swap)
3608 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3607 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3609 3608
3610 return val << PAGE_SHIFT; 3609 return val << PAGE_SHIFT;
3611 } 3610 }
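/*
 * Worked example for the root-cgroup path above (a sketch, assuming 4KiB
 * pages): with MEM_CGROUP_STAT_CACHE summing to 100 pages and
 * MEM_CGROUP_STAT_RSS to 50 pages over the hierarchy, mem_cgroup_usage()
 * reports (100 + 50) << PAGE_SHIFT = 150 * 4096 = 614400 bytes.
 */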
3612 3611
3613 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3612 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3614 { 3613 {
3615 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3614 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
3616 u64 val; 3615 u64 val;
3617 int type, name; 3616 int type, name;
3618 3617
3619 type = MEMFILE_TYPE(cft->private); 3618 type = MEMFILE_TYPE(cft->private);
3620 name = MEMFILE_ATTR(cft->private); 3619 name = MEMFILE_ATTR(cft->private);
3621 switch (type) { 3620 switch (type) {
3622 case _MEM: 3621 case _MEM:
3623 if (name == RES_USAGE) 3622 if (name == RES_USAGE)
3624 val = mem_cgroup_usage(mem, false); 3623 val = mem_cgroup_usage(mem, false);
3625 else 3624 else
3626 val = res_counter_read_u64(&mem->res, name); 3625 val = res_counter_read_u64(&mem->res, name);
3627 break; 3626 break;
3628 case _MEMSWAP: 3627 case _MEMSWAP:
3629 if (name == RES_USAGE) 3628 if (name == RES_USAGE)
3630 val = mem_cgroup_usage(mem, true); 3629 val = mem_cgroup_usage(mem, true);
3631 else 3630 else
3632 val = res_counter_read_u64(&mem->memsw, name); 3631 val = res_counter_read_u64(&mem->memsw, name);
3633 break; 3632 break;
3634 default: 3633 default:
3635 BUG(); 3634 BUG();
3636 break; 3635 break;
3637 } 3636 }
3638 return val; 3637 return val;
3639 } 3638 }
3640 /* 3639 /*
3641 * The user of this function is... 3640 * The user of this function is...
3642 * RES_LIMIT. 3641 * RES_LIMIT.
3643 */ 3642 */
3644 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 3643 static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3645 const char *buffer) 3644 const char *buffer)
3646 { 3645 {
3647 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3646 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3648 int type, name; 3647 int type, name;
3649 unsigned long long val; 3648 unsigned long long val;
3650 int ret; 3649 int ret;
3651 3650
3652 type = MEMFILE_TYPE(cft->private); 3651 type = MEMFILE_TYPE(cft->private);
3653 name = MEMFILE_ATTR(cft->private); 3652 name = MEMFILE_ATTR(cft->private);
3654 switch (name) { 3653 switch (name) {
3655 case RES_LIMIT: 3654 case RES_LIMIT:
3656 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3655 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3657 ret = -EINVAL; 3656 ret = -EINVAL;
3658 break; 3657 break;
3659 } 3658 }
3660 /* This function does all the necessary parsing...reuse it */ 3659 /* This function does all the necessary parsing...reuse it */
3661 ret = res_counter_memparse_write_strategy(buffer, &val); 3660 ret = res_counter_memparse_write_strategy(buffer, &val);
3662 if (ret) 3661 if (ret)
3663 break; 3662 break;
3664 if (type == _MEM) 3663 if (type == _MEM)
3665 ret = mem_cgroup_resize_limit(memcg, val); 3664 ret = mem_cgroup_resize_limit(memcg, val);
3666 else 3665 else
3667 ret = mem_cgroup_resize_memsw_limit(memcg, val); 3666 ret = mem_cgroup_resize_memsw_limit(memcg, val);
3668 break; 3667 break;
3669 case RES_SOFT_LIMIT: 3668 case RES_SOFT_LIMIT:
3670 ret = res_counter_memparse_write_strategy(buffer, &val); 3669 ret = res_counter_memparse_write_strategy(buffer, &val);
3671 if (ret) 3670 if (ret)
3672 break; 3671 break;
3673 /* 3672 /*
3674 * For memsw, soft limits are hard to implement in terms 3673 * For memsw, soft limits are hard to implement in terms
3675 * of semantics, for now, we support soft limits for 3674 * of semantics, for now, we support soft limits for
3676 * control without swap 3675 * control without swap
3677 */ 3676 */
3678 if (type == _MEM) 3677 if (type == _MEM)
3679 ret = res_counter_set_soft_limit(&memcg->res, val); 3678 ret = res_counter_set_soft_limit(&memcg->res, val);
3680 else 3679 else
3681 ret = -EINVAL; 3680 ret = -EINVAL;
3682 break; 3681 break;
3683 default: 3682 default:
3684 ret = -EINVAL; /* should be BUG() ? */ 3683 ret = -EINVAL; /* should be BUG() ? */
3685 break; 3684 break;
3686 } 3685 }
3687 return ret; 3686 return ret;
3688 } 3687 }
3689 3688
3690 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 3689 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
3691 unsigned long long *mem_limit, unsigned long long *memsw_limit) 3690 unsigned long long *mem_limit, unsigned long long *memsw_limit)
3692 { 3691 {
3693 struct cgroup *cgroup; 3692 struct cgroup *cgroup;
3694 unsigned long long min_limit, min_memsw_limit, tmp; 3693 unsigned long long min_limit, min_memsw_limit, tmp;
3695 3694
3696 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3695 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
3697 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3696 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3698 cgroup = memcg->css.cgroup; 3697 cgroup = memcg->css.cgroup;
3699 if (!memcg->use_hierarchy) 3698 if (!memcg->use_hierarchy)
3700 goto out; 3699 goto out;
3701 3700
3702 while (cgroup->parent) { 3701 while (cgroup->parent) {
3703 cgroup = cgroup->parent; 3702 cgroup = cgroup->parent;
3704 memcg = mem_cgroup_from_cont(cgroup); 3703 memcg = mem_cgroup_from_cont(cgroup);
3705 if (!memcg->use_hierarchy) 3704 if (!memcg->use_hierarchy)
3706 break; 3705 break;
3707 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 3706 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
3708 min_limit = min(min_limit, tmp); 3707 min_limit = min(min_limit, tmp);
3709 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3708 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
3710 min_memsw_limit = min(min_memsw_limit, tmp); 3709 min_memsw_limit = min(min_memsw_limit, tmp);
3711 } 3710 }
3712 out: 3711 out:
3713 *mem_limit = min_limit; 3712 *mem_limit = min_limit;
3714 *memsw_limit = min_memsw_limit; 3713 *memsw_limit = min_memsw_limit;
3715 return; 3714 return;
3716 } 3715 }
3717 3716
3718 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3717 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3719 { 3718 {
3720 struct mem_cgroup *mem; 3719 struct mem_cgroup *mem;
3721 int type, name; 3720 int type, name;
3722 3721
3723 mem = mem_cgroup_from_cont(cont); 3722 mem = mem_cgroup_from_cont(cont);
3724 type = MEMFILE_TYPE(event); 3723 type = MEMFILE_TYPE(event);
3725 name = MEMFILE_ATTR(event); 3724 name = MEMFILE_ATTR(event);
3726 switch (name) { 3725 switch (name) {
3727 case RES_MAX_USAGE: 3726 case RES_MAX_USAGE:
3728 if (type == _MEM) 3727 if (type == _MEM)
3729 res_counter_reset_max(&mem->res); 3728 res_counter_reset_max(&mem->res);
3730 else 3729 else
3731 res_counter_reset_max(&mem->memsw); 3730 res_counter_reset_max(&mem->memsw);
3732 break; 3731 break;
3733 case RES_FAILCNT: 3732 case RES_FAILCNT:
3734 if (type == _MEM) 3733 if (type == _MEM)
3735 res_counter_reset_failcnt(&mem->res); 3734 res_counter_reset_failcnt(&mem->res);
3736 else 3735 else
3737 res_counter_reset_failcnt(&mem->memsw); 3736 res_counter_reset_failcnt(&mem->memsw);
3738 break; 3737 break;
3739 } 3738 }
3740 3739
3741 return 0; 3740 return 0;
3742 } 3741 }
3743 3742
3744 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, 3743 static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
3745 struct cftype *cft) 3744 struct cftype *cft)
3746 { 3745 {
3747 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; 3746 return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate;
3748 } 3747 }
3749 3748
3750 #ifdef CONFIG_MMU 3749 #ifdef CONFIG_MMU
3751 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3750 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3752 struct cftype *cft, u64 val) 3751 struct cftype *cft, u64 val)
3753 { 3752 {
3754 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 3753 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
3755 3754
3756 if (val >= (1 << NR_MOVE_TYPE)) 3755 if (val >= (1 << NR_MOVE_TYPE))
3757 return -EINVAL; 3756 return -EINVAL;
3758 /* 3757 /*
3759 * We check this value several times both in can_attach() and 3758 * We check this value several times both in can_attach() and
3760 * attach(), so we need cgroup lock to prevent this value from being 3759 * attach(), so we need cgroup lock to prevent this value from being
3761 * inconsistent. 3760 * inconsistent.
3762 */ 3761 */
3763 cgroup_lock(); 3762 cgroup_lock();
3764 mem->move_charge_at_immigrate = val; 3763 mem->move_charge_at_immigrate = val;
3765 cgroup_unlock(); 3764 cgroup_unlock();
3766 3765
3767 return 0; 3766 return 0;
3768 } 3767 }
3769 #else 3768 #else
3770 static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 3769 static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
3771 struct cftype *cft, u64 val) 3770 struct cftype *cft, u64 val)
3772 { 3771 {
3773 return -ENOSYS; 3772 return -ENOSYS;
3774 } 3773 }
3775 #endif 3774 #endif
3776 3775
3777 3776
3778 /* For read statistics */ 3777 /* For read statistics */
3779 enum { 3778 enum {
3780 MCS_CACHE, 3779 MCS_CACHE,
3781 MCS_RSS, 3780 MCS_RSS,
3782 MCS_FILE_MAPPED, 3781 MCS_FILE_MAPPED,
3783 MCS_PGPGIN, 3782 MCS_PGPGIN,
3784 MCS_PGPGOUT, 3783 MCS_PGPGOUT,
3785 MCS_SWAP, 3784 MCS_SWAP,
3786 MCS_INACTIVE_ANON, 3785 MCS_INACTIVE_ANON,
3787 MCS_ACTIVE_ANON, 3786 MCS_ACTIVE_ANON,
3788 MCS_INACTIVE_FILE, 3787 MCS_INACTIVE_FILE,
3789 MCS_ACTIVE_FILE, 3788 MCS_ACTIVE_FILE,
3790 MCS_UNEVICTABLE, 3789 MCS_UNEVICTABLE,
3791 NR_MCS_STAT, 3790 NR_MCS_STAT,
3792 }; 3791 };
3793 3792
3794 struct mcs_total_stat { 3793 struct mcs_total_stat {
3795 s64 stat[NR_MCS_STAT]; 3794 s64 stat[NR_MCS_STAT];
3796 }; 3795 };
3797 3796
3798 struct { 3797 struct {
3799 char *local_name; 3798 char *local_name;
3800 char *total_name; 3799 char *total_name;
3801 } memcg_stat_strings[NR_MCS_STAT] = { 3800 } memcg_stat_strings[NR_MCS_STAT] = {
3802 {"cache", "total_cache"}, 3801 {"cache", "total_cache"},
3803 {"rss", "total_rss"}, 3802 {"rss", "total_rss"},
3804 {"mapped_file", "total_mapped_file"}, 3803 {"mapped_file", "total_mapped_file"},
3805 {"pgpgin", "total_pgpgin"}, 3804 {"pgpgin", "total_pgpgin"},
3806 {"pgpgout", "total_pgpgout"}, 3805 {"pgpgout", "total_pgpgout"},
3807 {"swap", "total_swap"}, 3806 {"swap", "total_swap"},
3808 {"inactive_anon", "total_inactive_anon"}, 3807 {"inactive_anon", "total_inactive_anon"},
3809 {"active_anon", "total_active_anon"}, 3808 {"active_anon", "total_active_anon"},
3810 {"inactive_file", "total_inactive_file"}, 3809 {"inactive_file", "total_inactive_file"},
3811 {"active_file", "total_active_file"}, 3810 {"active_file", "total_active_file"},
3812 {"unevictable", "total_unevictable"} 3811 {"unevictable", "total_unevictable"}
3813 }; 3812 };
3814 3813
3815 3814
3816 static void 3815 static void
3817 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3816 mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3818 { 3817 {
3819 s64 val; 3818 s64 val;
3820 3819
3821 /* per cpu stat */ 3820 /* per cpu stat */
3822 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 3821 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
3823 s->stat[MCS_CACHE] += val * PAGE_SIZE; 3822 s->stat[MCS_CACHE] += val * PAGE_SIZE;
3824 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 3823 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
3825 s->stat[MCS_RSS] += val * PAGE_SIZE; 3824 s->stat[MCS_RSS] += val * PAGE_SIZE;
3826 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 3825 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3827 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 3826 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3828 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 3827 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3829 s->stat[MCS_PGPGIN] += val; 3828 s->stat[MCS_PGPGIN] += val;
3830 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 3829 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3831 s->stat[MCS_PGPGOUT] += val; 3830 s->stat[MCS_PGPGOUT] += val;
3832 if (do_swap_account) { 3831 if (do_swap_account) {
3833 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3832 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3834 s->stat[MCS_SWAP] += val * PAGE_SIZE; 3833 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3835 } 3834 }
3836 3835
3837 /* per zone stat */ 3836 /* per zone stat */
3838 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 3837 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
3839 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 3838 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
3840 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); 3839 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
3841 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 3840 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
3842 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); 3841 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
3843 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 3842 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
3844 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); 3843 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
3845 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3844 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3846 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3845 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3847 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3846 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3848 } 3847 }
3849 3848
3850 static void 3849 static void
3851 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3850 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3852 { 3851 {
3853 struct mem_cgroup *iter; 3852 struct mem_cgroup *iter;
3854 3853
3855 for_each_mem_cgroup_tree(iter, mem) 3854 for_each_mem_cgroup_tree(iter, mem)
3856 mem_cgroup_get_local_stat(iter, s); 3855 mem_cgroup_get_local_stat(iter, s);
3857 } 3856 }
3858 3857
3859 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3858 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3860 struct cgroup_map_cb *cb) 3859 struct cgroup_map_cb *cb)
3861 { 3860 {
3862 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 3861 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
3863 struct mcs_total_stat mystat; 3862 struct mcs_total_stat mystat;
3864 int i; 3863 int i;
3865 3864
3866 memset(&mystat, 0, sizeof(mystat)); 3865 memset(&mystat, 0, sizeof(mystat));
3867 mem_cgroup_get_local_stat(mem_cont, &mystat); 3866 mem_cgroup_get_local_stat(mem_cont, &mystat);
3868 3867
3869 for (i = 0; i < NR_MCS_STAT; i++) { 3868 for (i = 0; i < NR_MCS_STAT; i++) {
3870 if (i == MCS_SWAP && !do_swap_account) 3869 if (i == MCS_SWAP && !do_swap_account)
3871 continue; 3870 continue;
3872 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 3871 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
3873 } 3872 }
3874 3873
3875 /* Hierarchical information */ 3874 /* Hierarchical information */
3876 { 3875 {
3877 unsigned long long limit, memsw_limit; 3876 unsigned long long limit, memsw_limit;
3878 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 3877 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
3879 cb->fill(cb, "hierarchical_memory_limit", limit); 3878 cb->fill(cb, "hierarchical_memory_limit", limit);
3880 if (do_swap_account) 3879 if (do_swap_account)
3881 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 3880 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
3882 } 3881 }
3883 3882
3884 memset(&mystat, 0, sizeof(mystat)); 3883 memset(&mystat, 0, sizeof(mystat));
3885 mem_cgroup_get_total_stat(mem_cont, &mystat); 3884 mem_cgroup_get_total_stat(mem_cont, &mystat);
3886 for (i = 0; i < NR_MCS_STAT; i++) { 3885 for (i = 0; i < NR_MCS_STAT; i++) {
3887 if (i == MCS_SWAP && !do_swap_account) 3886 if (i == MCS_SWAP && !do_swap_account)
3888 continue; 3887 continue;
3889 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 3888 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
3890 } 3889 }
3891 3890
3892 #ifdef CONFIG_DEBUG_VM 3891 #ifdef CONFIG_DEBUG_VM
3893 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 3892 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
3894 3893
3895 { 3894 {
3896 int nid, zid; 3895 int nid, zid;
3897 struct mem_cgroup_per_zone *mz; 3896 struct mem_cgroup_per_zone *mz;
3898 unsigned long recent_rotated[2] = {0, 0}; 3897 unsigned long recent_rotated[2] = {0, 0};
3899 unsigned long recent_scanned[2] = {0, 0}; 3898 unsigned long recent_scanned[2] = {0, 0};
3900 3899
3901 for_each_online_node(nid) 3900 for_each_online_node(nid)
3902 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 3901 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3903 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 3902 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
3904 3903
3905 recent_rotated[0] += 3904 recent_rotated[0] +=
3906 mz->reclaim_stat.recent_rotated[0]; 3905 mz->reclaim_stat.recent_rotated[0];
3907 recent_rotated[1] += 3906 recent_rotated[1] +=
3908 mz->reclaim_stat.recent_rotated[1]; 3907 mz->reclaim_stat.recent_rotated[1];
3909 recent_scanned[0] += 3908 recent_scanned[0] +=
3910 mz->reclaim_stat.recent_scanned[0]; 3909 mz->reclaim_stat.recent_scanned[0];
3911 recent_scanned[1] += 3910 recent_scanned[1] +=
3912 mz->reclaim_stat.recent_scanned[1]; 3911 mz->reclaim_stat.recent_scanned[1];
3913 } 3912 }
3914 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 3913 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
3915 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 3914 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
3916 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 3915 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
3917 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 3916 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
3918 } 3917 }
3919 #endif 3918 #endif
3920 3919
3921 return 0; 3920 return 0;
3922 } 3921 }
3923 3922
3924 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) 3923 static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
3925 { 3924 {
3926 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3925 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3927 3926
3928 return get_swappiness(memcg); 3927 return get_swappiness(memcg);
3929 } 3928 }
3930 3929
3931 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, 3930 static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3932 u64 val) 3931 u64 val)
3933 { 3932 {
3934 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 3933 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
3935 struct mem_cgroup *parent; 3934 struct mem_cgroup *parent;
3936 3935
3937 if (val > 100) 3936 if (val > 100)
3938 return -EINVAL; 3937 return -EINVAL;
3939 3938
3940 if (cgrp->parent == NULL) 3939 if (cgrp->parent == NULL)
3941 return -EINVAL; 3940 return -EINVAL;
3942 3941
3943 parent = mem_cgroup_from_cont(cgrp->parent); 3942 parent = mem_cgroup_from_cont(cgrp->parent);
3944 3943
3945 cgroup_lock(); 3944 cgroup_lock();
3946 3945
3947 /* If under hierarchy, only empty-root can set this value */ 3946 /* If under hierarchy, only empty-root can set this value */
3948 if ((parent->use_hierarchy) || 3947 if ((parent->use_hierarchy) ||
3949 (memcg->use_hierarchy && !list_empty(&cgrp->children))) { 3948 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
3950 cgroup_unlock(); 3949 cgroup_unlock();
3951 return -EINVAL; 3950 return -EINVAL;
3952 } 3951 }
3953 3952
3954 memcg->swappiness = val; 3953 memcg->swappiness = val;
3955 3954
3956 cgroup_unlock(); 3955 cgroup_unlock();
3957 3956
3958 return 0; 3957 return 0;
3959 } 3958 }
3960 3959
3961 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 3960 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3962 { 3961 {
3963 struct mem_cgroup_threshold_ary *t; 3962 struct mem_cgroup_threshold_ary *t;
3964 u64 usage; 3963 u64 usage;
3965 int i; 3964 int i;
3966 3965
3967 rcu_read_lock(); 3966 rcu_read_lock();
3968 if (!swap) 3967 if (!swap)
3969 t = rcu_dereference(memcg->thresholds.primary); 3968 t = rcu_dereference(memcg->thresholds.primary);
3970 else 3969 else
3971 t = rcu_dereference(memcg->memsw_thresholds.primary); 3970 t = rcu_dereference(memcg->memsw_thresholds.primary);
3972 3971
3973 if (!t) 3972 if (!t)
3974 goto unlock; 3973 goto unlock;
3975 3974
3976 usage = mem_cgroup_usage(memcg, swap); 3975 usage = mem_cgroup_usage(memcg, swap);
3977 3976
3978 /* 3977 /*
3979 * current_threshold points to threshold just below usage. 3978 * current_threshold points to threshold just below usage.
3980 * If it's not true, a threshold was crossed after last 3979 * If it's not true, a threshold was crossed after last
3981 * call of __mem_cgroup_threshold(). 3980 * call of __mem_cgroup_threshold().
3982 */ 3981 */
3983 i = t->current_threshold; 3982 i = t->current_threshold;
3984 3983
3985 /* 3984 /*
3986 * Iterate backward over array of thresholds starting from 3985 * Iterate backward over array of thresholds starting from
3987 * current_threshold and check if a threshold is crossed. 3986 * current_threshold and check if a threshold is crossed.
3988 * If none of thresholds below usage is crossed, we read 3987 * If none of thresholds below usage is crossed, we read
3989 * only one element of the array here. 3988 * only one element of the array here.
3990 */ 3989 */
3991 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 3990 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3992 eventfd_signal(t->entries[i].eventfd, 1); 3991 eventfd_signal(t->entries[i].eventfd, 1);
3993 3992
3994 /* i = current_threshold + 1 */ 3993 /* i = current_threshold + 1 */
3995 i++; 3994 i++;
3996 3995
3997 /* 3996 /*
3998 * Iterate forward over array of thresholds starting from 3997 * Iterate forward over array of thresholds starting from
3999 * current_threshold+1 and check if a threshold is crossed. 3998 * current_threshold+1 and check if a threshold is crossed.
4000 * If none of thresholds above usage is crossed, we read 3999 * If none of thresholds above usage is crossed, we read
4001 * only one element of the array here. 4000 * only one element of the array here.
4002 */ 4001 */
4003 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 4002 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
4004 eventfd_signal(t->entries[i].eventfd, 1); 4003 eventfd_signal(t->entries[i].eventfd, 1);
4005 4004
4006 /* Update current_threshold */ 4005 /* Update current_threshold */
4007 t->current_threshold = i - 1; 4006 t->current_threshold = i - 1;
4008 unlock: 4007 unlock:
4009 rcu_read_unlock(); 4008 rcu_read_unlock();
4010 } 4009 }
4011 4010
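For reference while reading __mem_cgroup_threshold() above: the two passes walk a sorted threshold array down and then up from current_threshold, signalling every entry that usage has crossed since the last check. The fragment below is a minimal userspace model of that scan; the array, the hard-coded usage values and printf() stand in for mem_cgroup_threshold_ary, mem_cgroup_usage() and eventfd_signal(), and every name in it is illustrative only.

/*
 * Minimal userspace model of the two-pass scan in __mem_cgroup_threshold().
 * The sorted array, the hard-coded usage values and printf() stand in for
 * mem_cgroup_threshold_ary, mem_cgroup_usage() and eventfd_signal();
 * everything here is illustrative.
 */
#include <stdio.h>

struct threshold { unsigned long long value; };

static struct threshold entries[] = { {100}, {200}, {300}, {400} };
static int current_threshold = 2;       /* last known: just below usage */

static void check_thresholds(unsigned long long usage)
{
        int size = sizeof(entries) / sizeof(entries[0]);
        int i = current_threshold;

        /* walk down: entries above the new usage were crossed downwards */
        for (; i >= 0 && entries[i].value > usage; i--)
                printf("crossed (down): %llu\n", entries[i].value);

        /* step past the threshold just below (or equal to) the new usage */
        i++;

        /* walk up: entries now at or below usage were crossed upwards */
        for (; i < size && entries[i].value <= usage; i++)
                printf("crossed (up): %llu\n", entries[i].value);

        /* remember the threshold just below the current usage */
        current_threshold = i - 1;
}

int main(void)
{
        check_thresholds(150);  /* usage fell: reports 300, then 200 */
        check_thresholds(450);  /* usage grew: reports 200, 300, 400 */
        return 0;
}
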
4012 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 4011 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4013 { 4012 {
4014 while (memcg) { 4013 while (memcg) {
4015 __mem_cgroup_threshold(memcg, false); 4014 __mem_cgroup_threshold(memcg, false);
4016 if (do_swap_account) 4015 if (do_swap_account)
4017 __mem_cgroup_threshold(memcg, true); 4016 __mem_cgroup_threshold(memcg, true);
4018 4017
4019 memcg = parent_mem_cgroup(memcg); 4018 memcg = parent_mem_cgroup(memcg);
4020 } 4019 }
4021 } 4020 }
4022 4021
4023 static int compare_thresholds(const void *a, const void *b) 4022 static int compare_thresholds(const void *a, const void *b)
4024 { 4023 {
4025 const struct mem_cgroup_threshold *_a = a; 4024 const struct mem_cgroup_threshold *_a = a;
4026 const struct mem_cgroup_threshold *_b = b; 4025 const struct mem_cgroup_threshold *_b = b;
4027 4026
4028 return _a->threshold - _b->threshold; 4027 return _a->threshold - _b->threshold;
4029 } 4028 }
4030 4029
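A note on compare_thresholds() above: it returns the difference of two u64 thresholds truncated to int, so entries whose thresholds differ by 2 GiB or more can compare with the wrong sign, or as equal. The standalone sketch below (struct and function names are illustrative, not from the kernel tree) shows the effect next to the conventional three-way comparison.

/*
 * Demonstrates how subtracting two u64 thresholds and truncating the result
 * to int (as compare_thresholds() above does) can misorder large values,
 * next to the conventional three-way comparison. Names are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct threshold { unsigned long long value; };

static int cmp_by_subtraction(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        return ta->value - tb->value;   /* u64 difference truncated to int */
}

static int cmp_three_way(const void *a, const void *b)
{
        const struct threshold *ta = a, *tb = b;

        if (ta->value < tb->value)
                return -1;
        return ta->value > tb->value;
}

int main(void)
{
        /* 3 GiB vs 1 GiB: their difference (2 GiB) does not fit in an int */
        struct threshold t[] = { { 3ULL << 30 }, { 1ULL << 30 } };

        /* on common ABIs the 2 GiB difference becomes negative, so the
         * numerically larger entry is (wrongly) left first */
        qsort(t, 2, sizeof(t[0]), cmp_by_subtraction);
        printf("subtraction: %llu %llu\n", t[0].value, t[1].value);

        qsort(t, 2, sizeof(t[0]), cmp_three_way);
        printf("three-way:   %llu %llu\n", t[0].value, t[1].value);
        return 0;
}

Only the sign of the comparator's return value matters to sort(), so the three-way form loses nothing while staying correct for arbitrary u64 thresholds.
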
4031 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4030 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
4032 { 4031 {
4033 struct mem_cgroup_eventfd_list *ev; 4032 struct mem_cgroup_eventfd_list *ev;
4034 4033
4035 list_for_each_entry(ev, &mem->oom_notify, list) 4034 list_for_each_entry(ev, &mem->oom_notify, list)
4036 eventfd_signal(ev->eventfd, 1); 4035 eventfd_signal(ev->eventfd, 1);
4037 return 0; 4036 return 0;
4038 } 4037 }
4039 4038
4040 static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4039 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
4041 { 4040 {
4042 struct mem_cgroup *iter; 4041 struct mem_cgroup *iter;
4043 4042
4044 for_each_mem_cgroup_tree(iter, mem) 4043 for_each_mem_cgroup_tree(iter, mem)
4045 mem_cgroup_oom_notify_cb(iter); 4044 mem_cgroup_oom_notify_cb(iter);
4046 } 4045 }
4047 4046
4048 static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4047 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4049 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4048 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4050 { 4049 {
4051 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4050 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4052 struct mem_cgroup_thresholds *thresholds; 4051 struct mem_cgroup_thresholds *thresholds;
4053 struct mem_cgroup_threshold_ary *new; 4052 struct mem_cgroup_threshold_ary *new;
4054 int type = MEMFILE_TYPE(cft->private); 4053 int type = MEMFILE_TYPE(cft->private);
4055 u64 threshold, usage; 4054 u64 threshold, usage;
4056 int i, size, ret; 4055 int i, size, ret;
4057 4056
4058 ret = res_counter_memparse_write_strategy(args, &threshold); 4057 ret = res_counter_memparse_write_strategy(args, &threshold);
4059 if (ret) 4058 if (ret)
4060 return ret; 4059 return ret;
4061 4060
4062 mutex_lock(&memcg->thresholds_lock); 4061 mutex_lock(&memcg->thresholds_lock);
4063 4062
4064 if (type == _MEM) 4063 if (type == _MEM)
4065 thresholds = &memcg->thresholds; 4064 thresholds = &memcg->thresholds;
4066 else if (type == _MEMSWAP) 4065 else if (type == _MEMSWAP)
4067 thresholds = &memcg->memsw_thresholds; 4066 thresholds = &memcg->memsw_thresholds;
4068 else 4067 else
4069 BUG(); 4068 BUG();
4070 4069
4071 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4070 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4072 4071
4073 /* Check if a threshold crossed before adding a new one */ 4072 /* Check if a threshold crossed before adding a new one */
4074 if (thresholds->primary) 4073 if (thresholds->primary)
4075 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4074 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4076 4075
4077 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 4076 size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4078 4077
4079 /* Allocate memory for new array of thresholds */ 4078 /* Allocate memory for new array of thresholds */
4080 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 4079 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
4081 GFP_KERNEL); 4080 GFP_KERNEL);
4082 if (!new) { 4081 if (!new) {
4083 ret = -ENOMEM; 4082 ret = -ENOMEM;
4084 goto unlock; 4083 goto unlock;
4085 } 4084 }
4086 new->size = size; 4085 new->size = size;
4087 4086
4088 /* Copy thresholds (if any) to new array */ 4087 /* Copy thresholds (if any) to new array */
4089 if (thresholds->primary) { 4088 if (thresholds->primary) {
4090 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 4089 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
4091 sizeof(struct mem_cgroup_threshold)); 4090 sizeof(struct mem_cgroup_threshold));
4092 } 4091 }
4093 4092
4094 /* Add new threshold */ 4093 /* Add new threshold */
4095 new->entries[size - 1].eventfd = eventfd; 4094 new->entries[size - 1].eventfd = eventfd;
4096 new->entries[size - 1].threshold = threshold; 4095 new->entries[size - 1].threshold = threshold;
4097 4096
4098 /* Sort thresholds. Registering of new threshold isn't time-critical */ 4097 /* Sort thresholds. Registering of new threshold isn't time-critical */
4099 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 4098 sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
4100 compare_thresholds, NULL); 4099 compare_thresholds, NULL);
4101 4100
4102 /* Find current threshold */ 4101 /* Find current threshold */
4103 new->current_threshold = -1; 4102 new->current_threshold = -1;
4104 for (i = 0; i < size; i++) { 4103 for (i = 0; i < size; i++) {
4105 if (new->entries[i].threshold < usage) { 4104 if (new->entries[i].threshold < usage) {
4106 /* 4105 /*
4107 * new->current_threshold will not be used until 4106 * new->current_threshold will not be used until
4108 * rcu_assign_pointer(), so it's safe to increment 4107 * rcu_assign_pointer(), so it's safe to increment
4109 * it here. 4108 * it here.
4110 */ 4109 */
4111 ++new->current_threshold; 4110 ++new->current_threshold;
4112 } 4111 }
4113 } 4112 }
4114 4113
4115 /* Free old spare buffer and save old primary buffer as spare */ 4114 /* Free old spare buffer and save old primary buffer as spare */
4116 kfree(thresholds->spare); 4115 kfree(thresholds->spare);
4117 thresholds->spare = thresholds->primary; 4116 thresholds->spare = thresholds->primary;
4118 4117
4119 rcu_assign_pointer(thresholds->primary, new); 4118 rcu_assign_pointer(thresholds->primary, new);
4120 4119
4121 /* To be sure that nobody uses thresholds */ 4120 /* To be sure that nobody uses thresholds */
4122 synchronize_rcu(); 4121 synchronize_rcu();
4123 4122
4124 unlock: 4123 unlock:
4125 mutex_unlock(&memcg->thresholds_lock); 4124 mutex_unlock(&memcg->thresholds_lock);
4126 4125
4127 return ret; 4126 return ret;
4128 } 4127 }
4129 4128
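mem_cgroup_usage_register_event() above is reached from userspace through the cgroup v1 cgroup.event_control file. A minimal registration sequence looks roughly like the sketch below; the mount point and the "demo" group name are assumptions, and error handling is kept to the bare minimum.

/*
 * Registering a memory-usage threshold from userspace (cgroup v1).
 * The mount point and group name below are assumptions; adjust as needed.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        char buf[64];
        uint64_t ticks;
        int efd, usage_fd, ctrl_fd;

        efd = eventfd(0, 0);
        usage_fd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
                        O_RDONLY);
        ctrl_fd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
                       O_WRONLY);
        if (efd < 0 || usage_fd < 0 || ctrl_fd < 0) {
                perror("setup");
                return 1;
        }

        /* "<event_fd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
        snprintf(buf, sizeof(buf), "%d %d %llu", efd, usage_fd, 64ULL << 20);
        if (write(ctrl_fd, buf, strlen(buf)) < 0) {
                perror("register");
                return 1;
        }

        /* blocks until the 64M threshold is crossed in either direction */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed (%llu event(s))\n",
                       (unsigned long long)ticks);
        return 0;
}
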
4130 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, 4129 static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4131 struct cftype *cft, struct eventfd_ctx *eventfd) 4130 struct cftype *cft, struct eventfd_ctx *eventfd)
4132 { 4131 {
4133 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4132 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4134 struct mem_cgroup_thresholds *thresholds; 4133 struct mem_cgroup_thresholds *thresholds;
4135 struct mem_cgroup_threshold_ary *new; 4134 struct mem_cgroup_threshold_ary *new;
4136 int type = MEMFILE_TYPE(cft->private); 4135 int type = MEMFILE_TYPE(cft->private);
4137 u64 usage; 4136 u64 usage;
4138 int i, j, size; 4137 int i, j, size;
4139 4138
4140 mutex_lock(&memcg->thresholds_lock); 4139 mutex_lock(&memcg->thresholds_lock);
4141 if (type == _MEM) 4140 if (type == _MEM)
4142 thresholds = &memcg->thresholds; 4141 thresholds = &memcg->thresholds;
4143 else if (type == _MEMSWAP) 4142 else if (type == _MEMSWAP)
4144 thresholds = &memcg->memsw_thresholds; 4143 thresholds = &memcg->memsw_thresholds;
4145 else 4144 else
4146 BUG(); 4145 BUG();
4147 4146
4148 /* 4147 /*
4149 * Something went wrong if we are trying to unregister a threshold 4148 * Something went wrong if we are trying to unregister a threshold
4150 * when we don't have any thresholds 4149 * when we don't have any thresholds
4151 */ 4150 */
4152 BUG_ON(!thresholds); 4151 BUG_ON(!thresholds);
4153 4152
4154 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 4153 usage = mem_cgroup_usage(memcg, type == _MEMSWAP);
4155 4154
4156 /* Check if a threshold crossed before removing */ 4155 /* Check if a threshold crossed before removing */
4157 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 4156 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4158 4157
4159 /* Calculate the new number of thresholds */ 4158 /* Calculate the new number of thresholds */
4160 size = 0; 4159 size = 0;
4161 for (i = 0; i < thresholds->primary->size; i++) { 4160 for (i = 0; i < thresholds->primary->size; i++) {
4162 if (thresholds->primary->entries[i].eventfd != eventfd) 4161 if (thresholds->primary->entries[i].eventfd != eventfd)
4163 size++; 4162 size++;
4164 } 4163 }
4165 4164
4166 new = thresholds->spare; 4165 new = thresholds->spare;
4167 4166
4168 /* Set thresholds array to NULL if we don't have thresholds */ 4167 /* Set thresholds array to NULL if we don't have thresholds */
4169 if (!size) { 4168 if (!size) {
4170 kfree(new); 4169 kfree(new);
4171 new = NULL; 4170 new = NULL;
4172 goto swap_buffers; 4171 goto swap_buffers;
4173 } 4172 }
4174 4173
4175 new->size = size; 4174 new->size = size;
4176 4175
4177 /* Copy thresholds and find current threshold */ 4176 /* Copy thresholds and find current threshold */
4178 new->current_threshold = -1; 4177 new->current_threshold = -1;
4179 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 4178 for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4180 if (thresholds->primary->entries[i].eventfd == eventfd) 4179 if (thresholds->primary->entries[i].eventfd == eventfd)
4181 continue; 4180 continue;
4182 4181
4183 new->entries[j] = thresholds->primary->entries[i]; 4182 new->entries[j] = thresholds->primary->entries[i];
4184 if (new->entries[j].threshold < usage) { 4183 if (new->entries[j].threshold < usage) {
4185 /* 4184 /*
4186 * new->current_threshold will not be used 4185 * new->current_threshold will not be used
4187 * until rcu_assign_pointer(), so it's safe to increment 4186 * until rcu_assign_pointer(), so it's safe to increment
4188 * it here. 4187 * it here.
4189 */ 4188 */
4190 ++new->current_threshold; 4189 ++new->current_threshold;
4191 } 4190 }
4192 j++; 4191 j++;
4193 } 4192 }
4194 4193
4195 swap_buffers: 4194 swap_buffers:
4196 /* Swap primary and spare array */ 4195 /* Swap primary and spare array */
4197 thresholds->spare = thresholds->primary; 4196 thresholds->spare = thresholds->primary;
4198 rcu_assign_pointer(thresholds->primary, new); 4197 rcu_assign_pointer(thresholds->primary, new);
4199 4198
4200 /* To be sure that nobody uses thresholds */ 4199 /* To be sure that nobody uses thresholds */
4201 synchronize_rcu(); 4200 synchronize_rcu();
4202 4201
4203 mutex_unlock(&memcg->thresholds_lock); 4202 mutex_unlock(&memcg->thresholds_lock);
4204 } 4203 }
4205 4204
4206 static int mem_cgroup_oom_register_event(struct cgroup *cgrp, 4205 static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4207 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 4206 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
4208 { 4207 {
4209 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); 4208 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4210 struct mem_cgroup_eventfd_list *event; 4209 struct mem_cgroup_eventfd_list *event;
4211 int type = MEMFILE_TYPE(cft->private); 4210 int type = MEMFILE_TYPE(cft->private);
4212 4211
4213 BUG_ON(type != _OOM_TYPE); 4212 BUG_ON(type != _OOM_TYPE);
4214 event = kmalloc(sizeof(*event), GFP_KERNEL); 4213 event = kmalloc(sizeof(*event), GFP_KERNEL);
4215 if (!event) 4214 if (!event)
4216 return -ENOMEM; 4215 return -ENOMEM;
4217 4216
4218 mutex_lock(&memcg_oom_mutex); 4217 mutex_lock(&memcg_oom_mutex);
4219 4218
4220 event->eventfd = eventfd; 4219 event->eventfd = eventfd;
4221 list_add(&event->list, &memcg->oom_notify); 4220 list_add(&event->list, &memcg->oom_notify);
4222 4221
4223 /* already in OOM ? */ 4222 /* already in OOM ? */
4224 if (atomic_read(&memcg->oom_lock)) 4223 if (atomic_read(&memcg->oom_lock))
4225 eventfd_signal(eventfd, 1); 4224 eventfd_signal(eventfd, 1);
4226 mutex_unlock(&memcg_oom_mutex); 4225 mutex_unlock(&memcg_oom_mutex);
4227 4226
4228 return 0; 4227 return 0;
4229 } 4228 }
4230 4229
4231 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4230 static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4232 struct cftype *cft, struct eventfd_ctx *eventfd) 4231 struct cftype *cft, struct eventfd_ctx *eventfd)
4233 { 4232 {
4234 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4233 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4235 struct mem_cgroup_eventfd_list *ev, *tmp; 4234 struct mem_cgroup_eventfd_list *ev, *tmp;
4236 int type = MEMFILE_TYPE(cft->private); 4235 int type = MEMFILE_TYPE(cft->private);
4237 4236
4238 BUG_ON(type != _OOM_TYPE); 4237 BUG_ON(type != _OOM_TYPE);
4239 4238
4240 mutex_lock(&memcg_oom_mutex); 4239 mutex_lock(&memcg_oom_mutex);
4241 4240
4242 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4241 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) {
4243 if (ev->eventfd == eventfd) { 4242 if (ev->eventfd == eventfd) {
4244 list_del(&ev->list); 4243 list_del(&ev->list);
4245 kfree(ev); 4244 kfree(ev);
4246 } 4245 }
4247 } 4246 }
4248 4247
4249 mutex_unlock(&memcg_oom_mutex); 4248 mutex_unlock(&memcg_oom_mutex);
4250 } 4249 }
4251 4250
4252 static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4251 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4253 struct cftype *cft, struct cgroup_map_cb *cb) 4252 struct cftype *cft, struct cgroup_map_cb *cb)
4254 { 4253 {
4255 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4254 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4256 4255
4257 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4256 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable);
4258 4257
4259 if (atomic_read(&mem->oom_lock)) 4258 if (atomic_read(&mem->oom_lock))
4260 cb->fill(cb, "under_oom", 1); 4259 cb->fill(cb, "under_oom", 1);
4261 else 4260 else
4262 cb->fill(cb, "under_oom", 0); 4261 cb->fill(cb, "under_oom", 0);
4263 return 0; 4262 return 0;
4264 } 4263 }
4265 4264
4266 static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4265 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4267 struct cftype *cft, u64 val) 4266 struct cftype *cft, u64 val)
4268 { 4267 {
4269 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4268 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4270 struct mem_cgroup *parent; 4269 struct mem_cgroup *parent;
4271 4270
4272 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4271 /* cannot set to root cgroup and only 0 and 1 are allowed */
4273 if (!cgrp->parent || !((val == 0) || (val == 1))) 4272 if (!cgrp->parent || !((val == 0) || (val == 1)))
4274 return -EINVAL; 4273 return -EINVAL;
4275 4274
4276 parent = mem_cgroup_from_cont(cgrp->parent); 4275 parent = mem_cgroup_from_cont(cgrp->parent);
4277 4276
4278 cgroup_lock(); 4277 cgroup_lock();
4279 /* oom-kill-disable is a flag for subhierarchy. */ 4278 /* oom-kill-disable is a flag for subhierarchy. */
4280 if ((parent->use_hierarchy) || 4279 if ((parent->use_hierarchy) ||
4281 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4280 (mem->use_hierarchy && !list_empty(&cgrp->children))) {
4282 cgroup_unlock(); 4281 cgroup_unlock();
4283 return -EINVAL; 4282 return -EINVAL;
4284 } 4283 }
4285 mem->oom_kill_disable = val; 4284 mem->oom_kill_disable = val;
4286 if (!val) 4285 if (!val)
4287 memcg_oom_recover(mem); 4286 memcg_oom_recover(mem);
4288 cgroup_unlock(); 4287 cgroup_unlock();
4289 return 0; 4288 return 0;
4290 } 4289 }
4291 4290
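The oom_control pieces above pair with an eventfd from userspace in the same way as the usage thresholds. A rough sketch follows, again assuming a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory with an existing "demo" group; both paths are assumptions and error handling is trimmed.

/*
 * Userspace side of the oom_control interface above (cgroup v1). The mount
 * point and the "demo" group are assumptions; error handling is trimmed.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        const char *grp = "/sys/fs/cgroup/memory/demo";
        char buf[64], path[128];
        uint64_t ticks;
        int efd, oom_fd, ctrl_fd;

        snprintf(path, sizeof(path), "%s/memory.oom_control", grp);
        oom_fd = open(path, O_RDWR);
        snprintf(path, sizeof(path), "%s/cgroup.event_control", grp);
        ctrl_fd = open(path, O_WRONLY);
        efd = eventfd(0, 0);
        if (oom_fd < 0 || ctrl_fd < 0 || efd < 0) {
                perror("setup");
                return 1;
        }

        /* disable the per-group OOM killer; tasks wait instead of dying */
        if (write(oom_fd, "1", 1) < 0)
                perror("oom_kill_disable");

        /* register for a notification when the group hits OOM */
        snprintf(buf, sizeof(buf), "%d %d", efd, oom_fd);
        if (write(ctrl_fd, buf, strlen(buf)) < 0)
                perror("register");

        /* wait; mem_cgroup_oom_notify() fires the eventfd on OOM */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("group is under OOM (see under_oom above)\n");
        return 0;
}
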
4292 static struct cftype mem_cgroup_files[] = { 4291 static struct cftype mem_cgroup_files[] = {
4293 { 4292 {
4294 .name = "usage_in_bytes", 4293 .name = "usage_in_bytes",
4295 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4294 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4296 .read_u64 = mem_cgroup_read, 4295 .read_u64 = mem_cgroup_read,
4297 .register_event = mem_cgroup_usage_register_event, 4296 .register_event = mem_cgroup_usage_register_event,
4298 .unregister_event = mem_cgroup_usage_unregister_event, 4297 .unregister_event = mem_cgroup_usage_unregister_event,
4299 }, 4298 },
4300 { 4299 {
4301 .name = "max_usage_in_bytes", 4300 .name = "max_usage_in_bytes",
4302 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4301 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4303 .trigger = mem_cgroup_reset, 4302 .trigger = mem_cgroup_reset,
4304 .read_u64 = mem_cgroup_read, 4303 .read_u64 = mem_cgroup_read,
4305 }, 4304 },
4306 { 4305 {
4307 .name = "limit_in_bytes", 4306 .name = "limit_in_bytes",
4308 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4307 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4309 .write_string = mem_cgroup_write, 4308 .write_string = mem_cgroup_write,
4310 .read_u64 = mem_cgroup_read, 4309 .read_u64 = mem_cgroup_read,
4311 }, 4310 },
4312 { 4311 {
4313 .name = "soft_limit_in_bytes", 4312 .name = "soft_limit_in_bytes",
4314 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4313 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4315 .write_string = mem_cgroup_write, 4314 .write_string = mem_cgroup_write,
4316 .read_u64 = mem_cgroup_read, 4315 .read_u64 = mem_cgroup_read,
4317 }, 4316 },
4318 { 4317 {
4319 .name = "failcnt", 4318 .name = "failcnt",
4320 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4319 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4321 .trigger = mem_cgroup_reset, 4320 .trigger = mem_cgroup_reset,
4322 .read_u64 = mem_cgroup_read, 4321 .read_u64 = mem_cgroup_read,
4323 }, 4322 },
4324 { 4323 {
4325 .name = "stat", 4324 .name = "stat",
4326 .read_map = mem_control_stat_show, 4325 .read_map = mem_control_stat_show,
4327 }, 4326 },
4328 { 4327 {
4329 .name = "force_empty", 4328 .name = "force_empty",
4330 .trigger = mem_cgroup_force_empty_write, 4329 .trigger = mem_cgroup_force_empty_write,
4331 }, 4330 },
4332 { 4331 {
4333 .name = "use_hierarchy", 4332 .name = "use_hierarchy",
4334 .write_u64 = mem_cgroup_hierarchy_write, 4333 .write_u64 = mem_cgroup_hierarchy_write,
4335 .read_u64 = mem_cgroup_hierarchy_read, 4334 .read_u64 = mem_cgroup_hierarchy_read,
4336 }, 4335 },
4337 { 4336 {
4338 .name = "swappiness", 4337 .name = "swappiness",
4339 .read_u64 = mem_cgroup_swappiness_read, 4338 .read_u64 = mem_cgroup_swappiness_read,
4340 .write_u64 = mem_cgroup_swappiness_write, 4339 .write_u64 = mem_cgroup_swappiness_write,
4341 }, 4340 },
4342 { 4341 {
4343 .name = "move_charge_at_immigrate", 4342 .name = "move_charge_at_immigrate",
4344 .read_u64 = mem_cgroup_move_charge_read, 4343 .read_u64 = mem_cgroup_move_charge_read,
4345 .write_u64 = mem_cgroup_move_charge_write, 4344 .write_u64 = mem_cgroup_move_charge_write,
4346 }, 4345 },
4347 { 4346 {
4348 .name = "oom_control", 4347 .name = "oom_control",
4349 .read_map = mem_cgroup_oom_control_read, 4348 .read_map = mem_cgroup_oom_control_read,
4350 .write_u64 = mem_cgroup_oom_control_write, 4349 .write_u64 = mem_cgroup_oom_control_write,
4351 .register_event = mem_cgroup_oom_register_event, 4350 .register_event = mem_cgroup_oom_register_event,
4352 .unregister_event = mem_cgroup_oom_unregister_event, 4351 .unregister_event = mem_cgroup_oom_unregister_event,
4353 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4352 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4354 }, 4353 },
4355 }; 4354 };
4356 4355
4357 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4356 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4358 static struct cftype memsw_cgroup_files[] = { 4357 static struct cftype memsw_cgroup_files[] = {
4359 { 4358 {
4360 .name = "memsw.usage_in_bytes", 4359 .name = "memsw.usage_in_bytes",
4361 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4360 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4362 .read_u64 = mem_cgroup_read, 4361 .read_u64 = mem_cgroup_read,
4363 .register_event = mem_cgroup_usage_register_event, 4362 .register_event = mem_cgroup_usage_register_event,
4364 .unregister_event = mem_cgroup_usage_unregister_event, 4363 .unregister_event = mem_cgroup_usage_unregister_event,
4365 }, 4364 },
4366 { 4365 {
4367 .name = "memsw.max_usage_in_bytes", 4366 .name = "memsw.max_usage_in_bytes",
4368 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4367 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4369 .trigger = mem_cgroup_reset, 4368 .trigger = mem_cgroup_reset,
4370 .read_u64 = mem_cgroup_read, 4369 .read_u64 = mem_cgroup_read,
4371 }, 4370 },
4372 { 4371 {
4373 .name = "memsw.limit_in_bytes", 4372 .name = "memsw.limit_in_bytes",
4374 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4373 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4375 .write_string = mem_cgroup_write, 4374 .write_string = mem_cgroup_write,
4376 .read_u64 = mem_cgroup_read, 4375 .read_u64 = mem_cgroup_read,
4377 }, 4376 },
4378 { 4377 {
4379 .name = "memsw.failcnt", 4378 .name = "memsw.failcnt",
4380 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4379 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4381 .trigger = mem_cgroup_reset, 4380 .trigger = mem_cgroup_reset,
4382 .read_u64 = mem_cgroup_read, 4381 .read_u64 = mem_cgroup_read,
4383 }, 4382 },
4384 }; 4383 };
4385 4384
4386 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4385 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4387 { 4386 {
4388 if (!do_swap_account) 4387 if (!do_swap_account)
4389 return 0; 4388 return 0;
4390 return cgroup_add_files(cont, ss, memsw_cgroup_files, 4389 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4391 ARRAY_SIZE(memsw_cgroup_files)); 4390 ARRAY_SIZE(memsw_cgroup_files));
4392 }; 4391 };
4393 #else 4392 #else
4394 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) 4393 static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4395 { 4394 {
4396 return 0; 4395 return 0;
4397 } 4396 }
4398 #endif 4397 #endif
4399 4398
4400 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4399 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4401 { 4400 {
4402 struct mem_cgroup_per_node *pn; 4401 struct mem_cgroup_per_node *pn;
4403 struct mem_cgroup_per_zone *mz; 4402 struct mem_cgroup_per_zone *mz;
4404 enum lru_list l; 4403 enum lru_list l;
4405 int zone, tmp = node; 4404 int zone, tmp = node;
4406 /* 4405 /*
4407 * This routine is called against possible nodes. 4406 * This routine is called against possible nodes.
4408 * But it's BUG to call kmalloc() against offline node. 4407 * But it's BUG to call kmalloc() against offline node.
4409 * 4408 *
4410 * TODO: this routine can waste much memory for nodes which will 4409 * TODO: this routine can waste much memory for nodes which will
4411 * never be onlined. It's better to use memory hotplug callback 4410 * never be onlined. It's better to use memory hotplug callback
4412 * function. 4411 * function.
4413 */ 4412 */
4414 if (!node_state(node, N_NORMAL_MEMORY)) 4413 if (!node_state(node, N_NORMAL_MEMORY))
4415 tmp = -1; 4414 tmp = -1;
4416 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4415 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4417 if (!pn) 4416 if (!pn)
4418 return 1; 4417 return 1;
4419 4418
4420 mem->info.nodeinfo[node] = pn; 4419 mem->info.nodeinfo[node] = pn;
4421 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4420 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4422 mz = &pn->zoneinfo[zone]; 4421 mz = &pn->zoneinfo[zone];
4423 for_each_lru(l) 4422 for_each_lru(l)
4424 INIT_LIST_HEAD(&mz->lists[l]); 4423 INIT_LIST_HEAD(&mz->lists[l]);
4425 mz->usage_in_excess = 0; 4424 mz->usage_in_excess = 0;
4426 mz->on_tree = false; 4425 mz->on_tree = false;
4427 mz->mem = mem; 4426 mz->mem = mem;
4428 } 4427 }
4429 return 0; 4428 return 0;
4430 } 4429 }
4431 4430
4432 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4431 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4433 { 4432 {
4434 kfree(mem->info.nodeinfo[node]); 4433 kfree(mem->info.nodeinfo[node]);
4435 } 4434 }
4436 4435
4437 static struct mem_cgroup *mem_cgroup_alloc(void) 4436 static struct mem_cgroup *mem_cgroup_alloc(void)
4438 { 4437 {
4439 struct mem_cgroup *mem; 4438 struct mem_cgroup *mem;
4440 int size = sizeof(struct mem_cgroup); 4439 int size = sizeof(struct mem_cgroup);
4441 4440
4442 /* Can be very big if MAX_NUMNODES is very big */ 4441 /* Can be very big if MAX_NUMNODES is very big */
4443 if (size < PAGE_SIZE) 4442 if (size < PAGE_SIZE)
4444 mem = kzalloc(size, GFP_KERNEL); 4443 mem = kzalloc(size, GFP_KERNEL);
4445 else 4444 else
4446 mem = vzalloc(size); 4445 mem = vzalloc(size);
4447 4446
4448 if (!mem) 4447 if (!mem)
4449 return NULL; 4448 return NULL;
4450 4449
4451 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4450 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4452 if (!mem->stat) 4451 if (!mem->stat)
4453 goto out_free; 4452 goto out_free;
4454 spin_lock_init(&mem->pcp_counter_lock); 4453 spin_lock_init(&mem->pcp_counter_lock);
4455 return mem; 4454 return mem;
4456 4455
4457 out_free: 4456 out_free:
4458 if (size < PAGE_SIZE) 4457 if (size < PAGE_SIZE)
4459 kfree(mem); 4458 kfree(mem);
4460 else 4459 else
4461 vfree(mem); 4460 vfree(mem);
4462 return NULL; 4461 return NULL;
4463 } 4462 }
4464 4463
4465 /* 4464 /*
4466 * At destroying mem_cgroup, references from swap_cgroup can remain. 4465 * At destroying mem_cgroup, references from swap_cgroup can remain.
4467 * (scanning all at force_empty is too costly...) 4466 * (scanning all at force_empty is too costly...)
4468 * 4467 *
4469 * Instead of clearing all references at force_empty, we remember 4468 * Instead of clearing all references at force_empty, we remember
4470 * the number of reference from swap_cgroup and free mem_cgroup when 4469 * the number of reference from swap_cgroup and free mem_cgroup when
4471 * it goes down to 0. 4470 * it goes down to 0.
4472 * 4471 *
4473 * Removal of cgroup itself succeeds regardless of refs from swap. 4472 * Removal of cgroup itself succeeds regardless of refs from swap.
4474 */ 4473 */
4475 4474
4476 static void __mem_cgroup_free(struct mem_cgroup *mem) 4475 static void __mem_cgroup_free(struct mem_cgroup *mem)
4477 { 4476 {
4478 int node; 4477 int node;
4479 4478
4480 mem_cgroup_remove_from_trees(mem); 4479 mem_cgroup_remove_from_trees(mem);
4481 free_css_id(&mem_cgroup_subsys, &mem->css); 4480 free_css_id(&mem_cgroup_subsys, &mem->css);
4482 4481
4483 for_each_node_state(node, N_POSSIBLE) 4482 for_each_node_state(node, N_POSSIBLE)
4484 free_mem_cgroup_per_zone_info(mem, node); 4483 free_mem_cgroup_per_zone_info(mem, node);
4485 4484
4486 free_percpu(mem->stat); 4485 free_percpu(mem->stat);
4487 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4486 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
4488 kfree(mem); 4487 kfree(mem);
4489 else 4488 else
4490 vfree(mem); 4489 vfree(mem);
4491 } 4490 }
4492 4491
4493 static void mem_cgroup_get(struct mem_cgroup *mem) 4492 static void mem_cgroup_get(struct mem_cgroup *mem)
4494 { 4493 {
4495 atomic_inc(&mem->refcnt); 4494 atomic_inc(&mem->refcnt);
4496 } 4495 }
4497 4496
4498 static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4497 static void __mem_cgroup_put(struct mem_cgroup *mem, int count)
4499 { 4498 {
4500 if (atomic_sub_and_test(count, &mem->refcnt)) { 4499 if (atomic_sub_and_test(count, &mem->refcnt)) {
4501 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4500 struct mem_cgroup *parent = parent_mem_cgroup(mem);
4502 __mem_cgroup_free(mem); 4501 __mem_cgroup_free(mem);
4503 if (parent) 4502 if (parent)
4504 mem_cgroup_put(parent); 4503 mem_cgroup_put(parent);
4505 } 4504 }
4506 } 4505 }
4507 4506
4508 static void mem_cgroup_put(struct mem_cgroup *mem) 4507 static void mem_cgroup_put(struct mem_cgroup *mem)
4509 { 4508 {
4510 __mem_cgroup_put(mem, 1); 4509 __mem_cgroup_put(mem, 1);
4511 } 4510 }
4512 4511
4513 /* 4512 /*
4514 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4513 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4515 */ 4514 */
4516 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4515 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
4517 { 4516 {
4518 if (!mem->res.parent) 4517 if (!mem->res.parent)
4519 return NULL; 4518 return NULL;
4520 return mem_cgroup_from_res_counter(mem->res.parent, res); 4519 return mem_cgroup_from_res_counter(mem->res.parent, res);
4521 } 4520 }
4522 4521
4523 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4522 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4524 static void __init enable_swap_cgroup(void) 4523 static void __init enable_swap_cgroup(void)
4525 { 4524 {
4526 if (!mem_cgroup_disabled() && really_do_swap_account) 4525 if (!mem_cgroup_disabled() && really_do_swap_account)
4527 do_swap_account = 1; 4526 do_swap_account = 1;
4528 } 4527 }
4529 #else 4528 #else
4530 static void __init enable_swap_cgroup(void) 4529 static void __init enable_swap_cgroup(void)
4531 { 4530 {
4532 } 4531 }
4533 #endif 4532 #endif
4534 4533
4535 static int mem_cgroup_soft_limit_tree_init(void) 4534 static int mem_cgroup_soft_limit_tree_init(void)
4536 { 4535 {
4537 struct mem_cgroup_tree_per_node *rtpn; 4536 struct mem_cgroup_tree_per_node *rtpn;
4538 struct mem_cgroup_tree_per_zone *rtpz; 4537 struct mem_cgroup_tree_per_zone *rtpz;
4539 int tmp, node, zone; 4538 int tmp, node, zone;
4540 4539
4541 for_each_node_state(node, N_POSSIBLE) { 4540 for_each_node_state(node, N_POSSIBLE) {
4542 tmp = node; 4541 tmp = node;
4543 if (!node_state(node, N_NORMAL_MEMORY)) 4542 if (!node_state(node, N_NORMAL_MEMORY))
4544 tmp = -1; 4543 tmp = -1;
4545 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); 4544 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4546 if (!rtpn) 4545 if (!rtpn)
4547 return 1; 4546 return 1;
4548 4547
4549 soft_limit_tree.rb_tree_per_node[node] = rtpn; 4548 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4550 4549
4551 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4550 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4552 rtpz = &rtpn->rb_tree_per_zone[zone]; 4551 rtpz = &rtpn->rb_tree_per_zone[zone];
4553 rtpz->rb_root = RB_ROOT; 4552 rtpz->rb_root = RB_ROOT;
4554 spin_lock_init(&rtpz->lock); 4553 spin_lock_init(&rtpz->lock);
4555 } 4554 }
4556 } 4555 }
4557 return 0; 4556 return 0;
4558 } 4557 }
4559 4558
4560 static struct cgroup_subsys_state * __ref 4559 static struct cgroup_subsys_state * __ref
4561 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4560 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4562 { 4561 {
4563 struct mem_cgroup *mem, *parent; 4562 struct mem_cgroup *mem, *parent;
4564 long error = -ENOMEM; 4563 long error = -ENOMEM;
4565 int node; 4564 int node;
4566 4565
4567 mem = mem_cgroup_alloc(); 4566 mem = mem_cgroup_alloc();
4568 if (!mem) 4567 if (!mem)
4569 return ERR_PTR(error); 4568 return ERR_PTR(error);
4570 4569
4571 for_each_node_state(node, N_POSSIBLE) 4570 for_each_node_state(node, N_POSSIBLE)
4572 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4571 if (alloc_mem_cgroup_per_zone_info(mem, node))
4573 goto free_out; 4572 goto free_out;
4574 4573
4575 /* root ? */ 4574 /* root ? */
4576 if (cont->parent == NULL) { 4575 if (cont->parent == NULL) {
4577 int cpu; 4576 int cpu;
4578 enable_swap_cgroup(); 4577 enable_swap_cgroup();
4579 parent = NULL; 4578 parent = NULL;
4580 root_mem_cgroup = mem; 4579 root_mem_cgroup = mem;
4581 if (mem_cgroup_soft_limit_tree_init()) 4580 if (mem_cgroup_soft_limit_tree_init())
4582 goto free_out; 4581 goto free_out;
4583 for_each_possible_cpu(cpu) { 4582 for_each_possible_cpu(cpu) {
4584 struct memcg_stock_pcp *stock = 4583 struct memcg_stock_pcp *stock =
4585 &per_cpu(memcg_stock, cpu); 4584 &per_cpu(memcg_stock, cpu);
4586 INIT_WORK(&stock->work, drain_local_stock); 4585 INIT_WORK(&stock->work, drain_local_stock);
4587 } 4586 }
4588 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4587 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4589 } else { 4588 } else {
4590 parent = mem_cgroup_from_cont(cont->parent); 4589 parent = mem_cgroup_from_cont(cont->parent);
4591 mem->use_hierarchy = parent->use_hierarchy; 4590 mem->use_hierarchy = parent->use_hierarchy;
4592 mem->oom_kill_disable = parent->oom_kill_disable; 4591 mem->oom_kill_disable = parent->oom_kill_disable;
4593 } 4592 }
4594 4593
4595 if (parent && parent->use_hierarchy) { 4594 if (parent && parent->use_hierarchy) {
4596 res_counter_init(&mem->res, &parent->res); 4595 res_counter_init(&mem->res, &parent->res);
4597 res_counter_init(&mem->memsw, &parent->memsw); 4596 res_counter_init(&mem->memsw, &parent->memsw);
4598 /* 4597 /*
4599 * We increment refcnt of the parent to ensure that we can 4598 * We increment refcnt of the parent to ensure that we can
4600 * safely access it on res_counter_charge/uncharge. 4599 * safely access it on res_counter_charge/uncharge.
4601 * This refcnt will be decremented when freeing this 4600 * This refcnt will be decremented when freeing this
4602 * mem_cgroup(see mem_cgroup_put). 4601 * mem_cgroup(see mem_cgroup_put).
4603 */ 4602 */
4604 mem_cgroup_get(parent); 4603 mem_cgroup_get(parent);
4605 } else { 4604 } else {
4606 res_counter_init(&mem->res, NULL); 4605 res_counter_init(&mem->res, NULL);
4607 res_counter_init(&mem->memsw, NULL); 4606 res_counter_init(&mem->memsw, NULL);
4608 } 4607 }
4609 mem->last_scanned_child = 0; 4608 mem->last_scanned_child = 0;
4610 INIT_LIST_HEAD(&mem->oom_notify); 4609 INIT_LIST_HEAD(&mem->oom_notify);
4611 4610
4612 if (parent) 4611 if (parent)
4613 mem->swappiness = get_swappiness(parent); 4612 mem->swappiness = get_swappiness(parent);
4614 atomic_set(&mem->refcnt, 1); 4613 atomic_set(&mem->refcnt, 1);
4615 mem->move_charge_at_immigrate = 0; 4614 mem->move_charge_at_immigrate = 0;
4616 mutex_init(&mem->thresholds_lock); 4615 mutex_init(&mem->thresholds_lock);
4617 return &mem->css; 4616 return &mem->css;
4618 free_out: 4617 free_out:
4619 __mem_cgroup_free(mem); 4618 __mem_cgroup_free(mem);
4620 root_mem_cgroup = NULL; 4619 root_mem_cgroup = NULL;
4621 return ERR_PTR(error); 4620 return ERR_PTR(error);
4622 } 4621 }
4623 4622
4624 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4623 static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
4625 struct cgroup *cont) 4624 struct cgroup *cont)
4626 { 4625 {
4627 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4626 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4628 4627
4629 return mem_cgroup_force_empty(mem, false); 4628 return mem_cgroup_force_empty(mem, false);
4630 } 4629 }
4631 4630
4632 static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4631 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
4633 struct cgroup *cont) 4632 struct cgroup *cont)
4634 { 4633 {
4635 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4634 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
4636 4635
4637 mem_cgroup_put(mem); 4636 mem_cgroup_put(mem);
4638 } 4637 }
4639 4638
4640 static int mem_cgroup_populate(struct cgroup_subsys *ss, 4639 static int mem_cgroup_populate(struct cgroup_subsys *ss,
4641 struct cgroup *cont) 4640 struct cgroup *cont)
4642 { 4641 {
4643 int ret; 4642 int ret;
4644 4643
4645 ret = cgroup_add_files(cont, ss, mem_cgroup_files, 4644 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
4646 ARRAY_SIZE(mem_cgroup_files)); 4645 ARRAY_SIZE(mem_cgroup_files));
4647 4646
4648 if (!ret) 4647 if (!ret)
4649 ret = register_memsw_files(cont, ss); 4648 ret = register_memsw_files(cont, ss);
4650 return ret; 4649 return ret;
4651 } 4650 }
4652 4651
4653 #ifdef CONFIG_MMU 4652 #ifdef CONFIG_MMU
4654 /* Handlers for move charge at task migration. */ 4653 /* Handlers for move charge at task migration. */
4655 #define PRECHARGE_COUNT_AT_ONCE 256 4654 #define PRECHARGE_COUNT_AT_ONCE 256
4656 static int mem_cgroup_do_precharge(unsigned long count) 4655 static int mem_cgroup_do_precharge(unsigned long count)
4657 { 4656 {
4658 int ret = 0; 4657 int ret = 0;
4659 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4658 int batch_count = PRECHARGE_COUNT_AT_ONCE;
4660 struct mem_cgroup *mem = mc.to; 4659 struct mem_cgroup *mem = mc.to;
4661 4660
4662 if (mem_cgroup_is_root(mem)) { 4661 if (mem_cgroup_is_root(mem)) {
4663 mc.precharge += count; 4662 mc.precharge += count;
4664 /* we don't need css_get for root */ 4663 /* we don't need css_get for root */
4665 return ret; 4664 return ret;
4666 } 4665 }
4667 /* try to charge at once */ 4666 /* try to charge at once */
4668 if (count > 1) { 4667 if (count > 1) {
4669 struct res_counter *dummy; 4668 struct res_counter *dummy;
4670 /* 4669 /*
4671 * "mem" cannot be under rmdir() because we've already checked 4670 * "mem" cannot be under rmdir() because we've already checked
4672 * by cgroup_lock_live_cgroup() that it is not removed and we 4671 * by cgroup_lock_live_cgroup() that it is not removed and we
4673 * are still under the same cgroup_mutex. So we can postpone 4672 * are still under the same cgroup_mutex. So we can postpone
4674 * css_get(). 4673 * css_get().
4675 */ 4674 */
4676 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 4675 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy))
4677 goto one_by_one; 4676 goto one_by_one;
4678 if (do_swap_account && res_counter_charge(&mem->memsw, 4677 if (do_swap_account && res_counter_charge(&mem->memsw,
4679 PAGE_SIZE * count, &dummy)) { 4678 PAGE_SIZE * count, &dummy)) {
4680 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 4679 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
4681 goto one_by_one; 4680 goto one_by_one;
4682 } 4681 }
4683 mc.precharge += count; 4682 mc.precharge += count;
4684 return ret; 4683 return ret;
4685 } 4684 }
4686 one_by_one: 4685 one_by_one:
4687 /* fall back to one by one charge */ 4686 /* fall back to one by one charge */
4688 while (count--) { 4687 while (count--) {
4689 if (signal_pending(current)) { 4688 if (signal_pending(current)) {
4690 ret = -EINTR; 4689 ret = -EINTR;
4691 break; 4690 break;
4692 } 4691 }
4693 if (!batch_count--) { 4692 if (!batch_count--) {
4694 batch_count = PRECHARGE_COUNT_AT_ONCE; 4693 batch_count = PRECHARGE_COUNT_AT_ONCE;
4695 cond_resched(); 4694 cond_resched();
4696 } 4695 }
4697 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 4696 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4698 if (ret || !mem) 4697 if (ret || !mem)
4699 /* mem_cgroup_clear_mc() will do uncharge later */ 4698 /* mem_cgroup_clear_mc() will do uncharge later */
4700 return -ENOMEM; 4699 return -ENOMEM;
4701 mc.precharge++; 4700 mc.precharge++;
4702 } 4701 }
4703 return ret; 4702 return ret;
4704 } 4703 }
4705 4704
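mem_cgroup_do_precharge() above tries one bulk res_counter charge and only then degrades to charging page by page, yielding every PRECHARGE_COUNT_AT_ONCE iterations. The sketch below models that shape in plain C; the counter stands in for res_counter, sched_yield() for cond_resched(), the signal_pending() check is left out, and all names are illustrative.

/*
 * Shape of the precharge fallback above, modelled in plain C: one bulk
 * reservation first, then item-by-item with periodic yielding. The counter
 * stands in for res_counter, sched_yield() for cond_resched(), and the
 * signal_pending() check is left out. All names are illustrative.
 */
#include <stdio.h>
#include <stdbool.h>
#include <sched.h>

#define BATCH_RESCHED 256

static long limit = 1000, used;

static bool reserve(long amount)
{
        if (used + amount > limit)
                return false;
        used += amount;
        return true;
}

static long precharge(long count)
{
        long done = 0;
        int batch = BATCH_RESCHED;

        if (reserve(count))
                return count;           /* bulk charge succeeded */

        while (count--) {               /* fall back to one-by-one */
                if (!batch--) {
                        batch = BATCH_RESCHED;
                        sched_yield();  /* cond_resched() stand-in */
                }
                if (!reserve(1))
                        break;
                done++;
        }
        return done;
}

int main(void)
{
        printf("reserved %ld of 900\n", precharge(900)); /* bulk path */
        printf("reserved %ld of 900\n", precharge(900)); /* partial fallback */
        return 0;
}
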
4706 /** 4705 /**
4707 * is_target_pte_for_mc - check a pte whether it is valid for move charge 4706 * is_target_pte_for_mc - check a pte whether it is valid for move charge
4708 * @vma: the vma to which the pte to be checked belongs 4707 * @vma: the vma to which the pte to be checked belongs
4709 * @addr: the address corresponding to the pte to be checked 4708 * @addr: the address corresponding to the pte to be checked
4710 * @ptent: the pte to be checked 4709 * @ptent: the pte to be checked
4711 * @target: the pointer where the target page or swap entry will be stored (can be NULL) 4710 * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4712 * 4711 *
4713 * Returns 4712 * Returns
4714 * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 4713 * 0(MC_TARGET_NONE): if the pte is not a target for move charge.
4715 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for 4714 * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4716 * move charge. if @target is not NULL, the page is stored in target->page 4715 * move charge. if @target is not NULL, the page is stored in target->page
4717 * with an extra refcount taken (callers should handle it). 4716 * with an extra refcount taken (callers should handle it).
4718 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a 4717 * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4719 * target for charge migration. if @target is not NULL, the entry is stored 4718 * target for charge migration. if @target is not NULL, the entry is stored
4720 * in target->ent. 4719 * in target->ent.
4721 * 4720 *
4722 * Called with pte lock held. 4721 * Called with pte lock held.
4723 */ 4722 */
4724 union mc_target { 4723 union mc_target {
4725 struct page *page; 4724 struct page *page;
4726 swp_entry_t ent; 4725 swp_entry_t ent;
4727 }; 4726 };
4728 4727
4729 enum mc_target_type { 4728 enum mc_target_type {
4730 MC_TARGET_NONE, /* not used */ 4729 MC_TARGET_NONE, /* not used */
4731 MC_TARGET_PAGE, 4730 MC_TARGET_PAGE,
4732 MC_TARGET_SWAP, 4731 MC_TARGET_SWAP,
4733 }; 4732 };
4734 4733
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
						unsigned long addr, pte_t ptent)
{
	struct page *page = vm_normal_page(vma, addr, ptent);

	if (!page || !page_mapped(page))
		return NULL;
	if (PageAnon(page)) {
		/* we don't move shared anon */
		if (!move_anon() || page_mapcount(page) > 2)
			return NULL;
	} else if (!move_file())
		/* we ignore mapcount for file pages */
		return NULL;
	if (!get_page_unless_zero(page))
		return NULL;

	return page;
}

static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	int usage_count;
	struct page *page = NULL;
	swp_entry_t ent = pte_to_swp_entry(ptent);

	if (!move_anon() || non_swap_entry(ent))
		return NULL;
	usage_count = mem_cgroup_count_swap_user(ent, &page);
	if (usage_count > 1) {	/* we don't move shared anon */
		if (page)
			put_page(page);
		return NULL;
	}
	if (do_swap_account)
		entry->val = ent.val;

	return page;
}

static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
			unsigned long addr, pte_t ptent, swp_entry_t *entry)
{
	struct page *page = NULL;
	struct inode *inode;
	struct address_space *mapping;
	pgoff_t pgoff;

	if (!vma->vm_file) /* anonymous vma */
		return NULL;
	if (!move_file())
		return NULL;

	inode = vma->vm_file->f_path.dentry->d_inode;
	mapping = vma->vm_file->f_mapping;
	if (pte_none(ptent))
		pgoff = linear_page_index(vma, addr);
	else /* pte_file(ptent) is true */
		pgoff = pte_to_pgoff(ptent);

	/* page is moved even if it's not RSS of this task(page-faulted). */
	if (!mapping_cap_swap_backed(mapping)) { /* normal file */
		page = find_get_page(mapping, pgoff);
	} else { /* shmem/tmpfs file. we should take account of swap too. */
		swp_entry_t ent;
		mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
		if (do_swap_account)
			entry->val = ent.val;
	}

	return page;
}

static int is_target_pte_for_mc(struct vm_area_struct *vma,
		unsigned long addr, pte_t ptent, union mc_target *target)
{
	struct page *page = NULL;
	struct page_cgroup *pc;
	int ret = 0;
	swp_entry_t ent = { .val = 0 };

	if (pte_present(ptent))
		page = mc_handle_present_pte(vma, addr, ptent);
	else if (is_swap_pte(ptent))
		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
	else if (pte_none(ptent) || pte_file(ptent))
		page = mc_handle_file_pte(vma, addr, ptent, &ent);

	if (!page && !ent.val)
		return 0;
	if (page) {
		pc = lookup_page_cgroup(page);
		/*
		 * Do only loose check w/o page_cgroup lock.
		 * mem_cgroup_move_account() checks the pc is valid or not under
		 * the lock.
		 */
		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
			ret = MC_TARGET_PAGE;
			if (target)
				target->page = page;
		}
		if (!ret || !target)
			put_page(page);
	}
	/* There is a swap entry and a page doesn't exist or isn't charged */
	if (ent.val && !ret &&
			css_id(&mc.from->css) == lookup_swap_cgroup(ent)) {
		ret = MC_TARGET_SWAP;
		if (target)
			target->ent = ent;
	}
	return ret;
}

static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
					unsigned long addr, unsigned long end,
					struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	split_huge_page_pmd(walk->mm, pmd);

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; pte++, addr += PAGE_SIZE)
		if (is_target_pte_for_mc(vma, addr, *pte, NULL))
			mc.precharge++;	/* increment precharge temporarily */
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	return 0;
}

static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
	unsigned long precharge;
	struct vm_area_struct *vma;

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct mm_walk mem_cgroup_count_precharge_walk = {
			.pmd_entry = mem_cgroup_count_precharge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		walk_page_range(vma->vm_start, vma->vm_end,
					&mem_cgroup_count_precharge_walk);
	}
	up_read(&mm->mmap_sem);

	precharge = mc.precharge;
	mc.precharge = 0;

	return precharge;
}

static int mem_cgroup_precharge_mc(struct mm_struct *mm)
{
	unsigned long precharge = mem_cgroup_count_precharge(mm);

	VM_BUG_ON(mc.moving_task);
	mc.moving_task = current;
	return mem_cgroup_do_precharge(precharge);
}

/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
static void __mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;
	struct mem_cgroup *to = mc.to;

	/* we must uncharge all the leftover precharges from mc.to */
	if (mc.precharge) {
		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
		mc.precharge = 0;
	}
	/*
	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
	 * we must uncharge here.
	 */
	if (mc.moved_charge) {
		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
		mc.moved_charge = 0;
	}
	/* we must fixup refcnts and charges */
	if (mc.moved_swap) {
		/* uncharge swap account from the old cgroup */
		if (!mem_cgroup_is_root(mc.from))
			res_counter_uncharge(&mc.from->memsw,
						PAGE_SIZE * mc.moved_swap);
		__mem_cgroup_put(mc.from, mc.moved_swap);

		if (!mem_cgroup_is_root(mc.to)) {
			/*
			 * we charged both to->res and to->memsw, so we should
			 * uncharge to->res.
			 */
			res_counter_uncharge(&mc.to->res,
						PAGE_SIZE * mc.moved_swap);
		}
		/* we've already done mem_cgroup_get(mc.to) */
		mc.moved_swap = 0;
	}
	memcg_oom_recover(from);
	memcg_oom_recover(to);
	wake_up_all(&mc.waitq);
}

static void mem_cgroup_clear_mc(void)
{
	struct mem_cgroup *from = mc.from;

	/*
	 * we must clear moving_task before waking up waiters at the end of
	 * task migration.
	 */
	mc.moving_task = NULL;
	__mem_cgroup_clear_mc();
	spin_lock(&mc.lock);
	mc.from = NULL;
	mc.to = NULL;
	spin_unlock(&mc.lock);
	mem_cgroup_end_move(from);
}

static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	int ret = 0;
	struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);

	if (mem->move_charge_at_immigrate) {
		struct mm_struct *mm;
		struct mem_cgroup *from = mem_cgroup_from_task(p);

		VM_BUG_ON(from == mem);

		mm = get_task_mm(p);
		if (!mm)
			return 0;
		/* We move charges only when we move the owner of the mm */
		if (mm->owner == p) {
			VM_BUG_ON(mc.from);
			VM_BUG_ON(mc.to);
			VM_BUG_ON(mc.precharge);
			VM_BUG_ON(mc.moved_charge);
			VM_BUG_ON(mc.moved_swap);
			mem_cgroup_start_move(from);
			spin_lock(&mc.lock);
			mc.from = from;
			mc.to = mem;
			spin_unlock(&mc.lock);
			/* We set mc.moving_task later */

			ret = mem_cgroup_precharge_mc(mm);
			if (ret)
				mem_cgroup_clear_mc();
		}
		mmput(mm);
	}
	return ret;
}

static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	mem_cgroup_clear_mc();
}

static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	int ret = 0;
	struct vm_area_struct *vma = walk->private;
	pte_t *pte;
	spinlock_t *ptl;

	split_huge_page_pmd(walk->mm, pmd);
retry:
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		pte_t ptent = *(pte++);
		union mc_target target;
		int type;
		struct page *page;
		struct page_cgroup *pc;
		swp_entry_t ent;

		if (!mc.precharge)
			break;

		type = is_target_pte_for_mc(vma, addr, ptent, &target);
		switch (type) {
		case MC_TARGET_PAGE:
			page = target.page;
			if (isolate_lru_page(page))
				goto put;
			pc = lookup_page_cgroup(page);
			if (!mem_cgroup_move_account(page, 1, pc,
						     mc.from, mc.to, false)) {
				mc.precharge--;
				/* we uncharge from mc.from later. */
				mc.moved_charge++;
			}
			putback_lru_page(page);
put:			/* is_target_pte_for_mc() gets the page */
			put_page(page);
			break;
		case MC_TARGET_SWAP:
			ent = target.ent;
			if (!mem_cgroup_move_swap_account(ent,
						mc.from, mc.to, false)) {
				mc.precharge--;
				/* we fixup refcnts and charges later. */
				mc.moved_swap++;
			}
			break;
		default:
			break;
		}
	}
	pte_unmap_unlock(pte - 1, ptl);
	cond_resched();

	if (addr != end) {
		/*
		 * We have consumed all precharges we got in can_attach().
		 * We try charge one by one, but don't do any additional
		 * charges to mc.to if we have failed in charge once in attach()
		 * phase.
		 */
		ret = mem_cgroup_do_precharge(1);
		if (!ret)
			goto retry;
	}

	return ret;
}

static void mem_cgroup_move_charge(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	lru_add_drain_all();
retry:
	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
		/*
		 * Someone holding the mmap_sem may be waiting in the waitq.
		 * So we cancel all extra charges, wake up all waiters, and
		 * retry. Because we cancel precharges, we might not be able
		 * to move enough charges, but moving charge is a best-effort
		 * feature anyway, so it wouldn't be a big problem.
		 */
		__mem_cgroup_clear_mc();
		cond_resched();
		goto retry;
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		int ret;
		struct mm_walk mem_cgroup_move_charge_walk = {
			.pmd_entry = mem_cgroup_move_charge_pte_range,
			.mm = mm,
			.private = vma,
		};
		if (is_vm_hugetlb_page(vma))
			continue;
		ret = walk_page_range(vma->vm_start, vma->vm_end,
						&mem_cgroup_move_charge_walk);
		if (ret)
			/*
			 * this means we have consumed all precharges and failed
			 * to do additional charge. Just abandon here.
			 */
			break;
	}
	up_read(&mm->mmap_sem);
}

static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
	struct mm_struct *mm;

	if (!mc.to)
		/* no need to move charge */
		return;

	mm = get_task_mm(p);
	if (mm) {
		mem_cgroup_move_charge(mm);
		mmput(mm);
	}
	mem_cgroup_clear_mc();
}
#else	/* !CONFIG_MMU */
static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
	return 0;
}
static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
				struct cgroup *cgroup,
				struct task_struct *p)
{
}
static void mem_cgroup_move_task(struct cgroup_subsys *ss,
				struct cgroup *cont,
				struct cgroup *old_cont,
				struct task_struct *p)
{
}
#endif

struct cgroup_subsys mem_cgroup_subsys = {
	.name = "memory",
	.subsys_id = mem_cgroup_subsys_id,
	.create = mem_cgroup_create,
	.pre_destroy = mem_cgroup_pre_destroy,
	.destroy = mem_cgroup_destroy,
	.populate = mem_cgroup_populate,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.attach = mem_cgroup_move_task,
	.early_init = 0,
	.use_id = 1,
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
static int __init enable_swap_account(char *s)
{
	/* consider enabled if no parameter or 1 is given */
	if (!strcmp(s, "1"))
		really_do_swap_account = 1;
	else if (!strcmp(s, "0"))
		really_do_swap_account = 0;
	return 1;
}
__setup("swapaccount=", enable_swap_account);

#endif
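Because enable_swap_account() is registered through __setup(), swap accounting can be toggled from the kernel command line when CONFIG_CGROUP_MEM_RES_CTLR_SWAP is built in: booting with swapaccount=0 disables it, and swapaccount=1 enables it.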
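The move-charge handlers wired into mem_cgroup_subsys above (.can_attach, .cancel_attach, .attach) are driven entirely through the cgroup filesystem. The following is a minimal userspace sketch of one way to exercise that path; it is illustrative only, and the mount point /cgroup/memory and the group name B are assumptions about the local setup, not anything defined by this file.

	/*
	 * Illustrative sketch only (not part of memcontrol.c).
	 * Assumes the memory controller is mounted at /cgroup/memory
	 * and that a child group "B" already exists.
	 */
	#include <stdio.h>
	#include <unistd.h>

	/* write a short string to a cgroup control file; 0 on success */
	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%s", val);
		return fclose(f);
	}

	int main(void)
	{
		char pid[32];

		/* let group B take over anon charges of tasks migrated into it */
		if (write_str("/cgroup/memory/B/memory.move_charge_at_immigrate", "1"))
			return 1;

		/*
		 * Moving this task into B triggers mem_cgroup_can_attach()
		 * (precharge) and mem_cgroup_move_task() (the actual move).
		 */
		snprintf(pid, sizeof(pid), "%d", getpid());
		return write_str("/cgroup/memory/B/tasks", pid) ? 1 : 0;
	}

Writing 1 to memory.move_charge_at_immigrate requests that anonymous-page charges follow the mm owner when it migrates; writing 3 would also include file-page charges.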